author    qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
committer qinxialei <xialeiqin@gmail.com>  2021-04-22 11:20:15 +0800
commit    2381d803c76105f44717d75f089ec37f51e5cfe4 (patch)
tree      33f40fb4dfd1039ac262d5f1c1065d298578ddc1
parent    e8d277081293b6fb2a5d469616baaa7a06f52496 (diff)
download  libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.gz
          libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.tar.bz2
          libgav1-2381d803c76105f44717d75f089ec37f51e5cfe4.zip
New upstream version 0.16.3
-rw-r--r-- .gitattributes | 1
-rw-r--r-- CMakeLists.txt | 55
-rw-r--r-- README.md | 20
-rw-r--r-- cmake/libgav1_build_definitions.cmake | 23
-rw-r--r-- cmake/libgav1_flags.cmake | 14
-rw-r--r-- cmake/libgav1_helpers.cmake | 14
-rw-r--r-- cmake/libgav1_sanitizer.cmake | 4
-rw-r--r-- cmake/libgav1_targets.cmake | 38
-rw-r--r-- examples/gav1_decode.cc | 3
-rw-r--r-- examples/logging.h | 2
-rw-r--r-- src/decoder_impl.cc | 27
-rw-r--r-- src/decoder_impl.h | 5
-rw-r--r-- src/decoder_state.h | 18
-rw-r--r-- src/dsp/arm/average_blend_neon.cc | 135
-rw-r--r-- src/dsp/arm/cdef_neon.cc | 11
-rw-r--r-- src/dsp/arm/common_neon.h | 70
-rw-r--r-- src/dsp/arm/convolve_neon.cc | 943
-rw-r--r-- src/dsp/arm/distance_weighted_blend_neon.cc | 162
-rw-r--r-- src/dsp/arm/distance_weighted_blend_neon.h | 2
-rw-r--r-- src/dsp/arm/film_grain_neon.cc | 2
-rw-r--r-- src/dsp/arm/intra_edge_neon.cc | 243
-rw-r--r-- src/dsp/arm/intra_edge_neon.h | 3
-rw-r--r-- src/dsp/arm/intrapred_cfl_neon.cc | 1012
-rw-r--r-- src/dsp/arm/intrapred_cfl_neon.h | 179
-rw-r--r-- src/dsp/arm/intrapred_directional_neon.cc | 594
-rw-r--r-- src/dsp/arm/intrapred_directional_neon.h | 56
-rw-r--r-- src/dsp/arm/intrapred_filter_neon.cc (renamed from src/dsp/arm/intrapred_filter_intra_neon.cc) | 10
-rw-r--r-- src/dsp/arm/intrapred_filter_neon.h | 37
-rw-r--r-- src/dsp/arm/intrapred_neon.cc | 247
-rw-r--r-- src/dsp/arm/intrapred_neon.h | 218
-rw-r--r-- src/dsp/arm/intrapred_smooth_neon.cc | 5
-rw-r--r-- src/dsp/arm/intrapred_smooth_neon.h | 149
-rw-r--r-- src/dsp/arm/inverse_transform_10bit_neon.cc | 2543
-rw-r--r-- src/dsp/arm/inverse_transform_neon.cc | 2
-rw-r--r-- src/dsp/arm/inverse_transform_neon.h | 16
-rw-r--r-- src/dsp/arm/loop_filter_neon.cc | 18
-rw-r--r-- src/dsp/arm/loop_restoration_neon.cc | 1470
-rw-r--r-- src/dsp/arm/mask_blend_neon.cc | 2
-rw-r--r-- src/dsp/arm/motion_field_projection_neon.cc | 2
-rw-r--r-- src/dsp/arm/motion_vector_search_neon.cc | 2
-rw-r--r-- src/dsp/arm/obmc_neon.cc | 2
-rw-r--r-- src/dsp/arm/super_res_neon.cc | 151
-rw-r--r-- src/dsp/arm/super_res_neon.h | 5
-rw-r--r-- src/dsp/arm/warp_neon.cc | 4
-rw-r--r-- src/dsp/arm/weight_mask_neon.cc | 2
-rw-r--r-- src/dsp/average_blend.cc | 2
-rw-r--r-- src/dsp/average_blend_test.cc | 322
-rw-r--r-- src/dsp/cdef.h | 1
-rw-r--r-- src/dsp/cdef_test.cc | 409
-rw-r--r-- src/dsp/constants.cc | 2
-rw-r--r-- src/dsp/convolve.cc | 4
-rw-r--r-- src/dsp/convolve_test.cc | 1373
-rw-r--r-- src/dsp/distance_weighted_blend_test.cc | 324
-rw-r--r-- src/dsp/dsp.cc | 57
-rw-r--r-- src/dsp/dsp.h | 24
-rw-r--r-- src/dsp/dsp_test.cc | 248
-rw-r--r-- src/dsp/film_grain.h | 8
-rw-r--r-- src/dsp/intra_edge_test.cc | 504
-rw-r--r-- src/dsp/intrapred.cc | 1484
-rw-r--r-- src/dsp/intrapred.h | 4
-rw-r--r-- src/dsp/intrapred_cfl.cc | 654
-rw-r--r-- src/dsp/intrapred_cfl.h | 48
-rw-r--r-- src/dsp/intrapred_cfl_test.cc | 923
-rw-r--r-- src/dsp/intrapred_directional.cc | 252
-rw-r--r-- src/dsp/intrapred_directional.h | 48
-rw-r--r-- src/dsp/intrapred_directional_test.cc | 929
-rw-r--r-- src/dsp/intrapred_filter.cc | 144
-rw-r--r-- src/dsp/intrapred_filter.h | 49
-rw-r--r-- src/dsp/intrapred_filter_test.cc | 554
-rw-r--r-- src/dsp/intrapred_smooth.cc | 738
-rw-r--r-- src/dsp/intrapred_smooth.h | 48
-rw-r--r-- src/dsp/intrapred_test.cc | 710
-rw-r--r-- src/dsp/inverse_transform.cc | 7
-rw-r--r-- src/dsp/inverse_transform_test.cc | 536
-rw-r--r-- src/dsp/libgav1_dsp.cmake | 28
-rw-r--r-- src/dsp/loop_filter_test.cc | 348
-rw-r--r-- src/dsp/loop_restoration.cc | 142
-rw-r--r-- src/dsp/loop_restoration_test.cc | 616
-rw-r--r-- src/dsp/mask_blend.cc | 9
-rw-r--r-- src/dsp/mask_blend_test.cc | 493
-rw-r--r-- src/dsp/motion_field_projection_test.cc | 213
-rw-r--r-- src/dsp/motion_vector_search_test.cc | 197
-rw-r--r-- src/dsp/obmc_test.cc | 349
-rw-r--r-- src/dsp/super_res.cc | 10
-rw-r--r-- src/dsp/super_res_test.cc | 264
-rw-r--r-- src/dsp/warp_test.cc | 649
-rw-r--r-- src/dsp/weight_mask_test.cc | 390
-rw-r--r-- src/dsp/x86/average_blend_sse4.cc | 224
-rw-r--r-- src/dsp/x86/average_blend_sse4.h | 4
-rw-r--r-- src/dsp/x86/cdef_avx2.cc | 784
-rw-r--r-- src/dsp/x86/cdef_avx2.h | 45
-rw-r--r-- src/dsp/x86/cdef_sse4.cc | 9
-rw-r--r-- src/dsp/x86/common_avx2.h | 151
-rw-r--r-- src/dsp/x86/common_avx2.inc | 121
-rw-r--r-- src/dsp/x86/common_sse4.h | 225
-rw-r--r-- src/dsp/x86/common_sse4.inc | 206
-rw-r--r-- src/dsp/x86/convolve_avx2.cc | 1286
-rw-r--r-- src/dsp/x86/convolve_avx2.h | 16
-rw-r--r-- src/dsp/x86/convolve_sse4.cc | 1039
-rw-r--r-- src/dsp/x86/convolve_sse4.inc | 934
-rw-r--r-- src/dsp/x86/distance_weighted_blend_sse4.cc | 223
-rw-r--r-- src/dsp/x86/distance_weighted_blend_sse4.h | 4
-rw-r--r-- src/dsp/x86/film_grain_sse4.cc | 514
-rw-r--r-- src/dsp/x86/film_grain_sse4.h | 40
-rw-r--r-- src/dsp/x86/intra_edge_sse4.cc | 4
-rw-r--r-- src/dsp/x86/intrapred_cfl_sse4.cc | 1057
-rw-r--r-- src/dsp/x86/intrapred_cfl_sse4.h | 376
-rw-r--r-- src/dsp/x86/intrapred_directional_sse4.cc | 1478
-rw-r--r-- src/dsp/x86/intrapred_directional_sse4.h | 54
-rw-r--r-- src/dsp/x86/intrapred_filter_sse4.cc | 432
-rw-r--r-- src/dsp/x86/intrapred_filter_sse4.h | 41
-rw-r--r-- src/dsp/x86/intrapred_smooth_sse4.cc | 27
-rw-r--r-- src/dsp/x86/intrapred_smooth_sse4.h | 318
-rw-r--r-- src/dsp/x86/intrapred_sse4.cc | 1355
-rw-r--r-- src/dsp/x86/intrapred_sse4.h | 473
-rw-r--r-- src/dsp/x86/inverse_transform_sse4.cc | 104
-rw-r--r-- src/dsp/x86/loop_filter_sse4.cc | 38
-rw-r--r-- src/dsp/x86/loop_restoration_10bit_avx2.cc | 2619
-rw-r--r-- src/dsp/x86/loop_restoration_10bit_sse4.cc | 2033
-rw-r--r-- src/dsp/x86/loop_restoration_avx2.cc | 339
-rw-r--r-- src/dsp/x86/loop_restoration_avx2.h | 4
-rw-r--r-- src/dsp/x86/loop_restoration_sse4.cc | 241
-rw-r--r-- src/dsp/x86/loop_restoration_sse4.h | 4
-rw-r--r-- src/dsp/x86/mask_blend_sse4.cc | 507
-rw-r--r-- src/dsp/x86/mask_blend_sse4.h | 24
-rw-r--r-- src/dsp/x86/motion_field_projection_sse4.cc | 6
-rw-r--r-- src/dsp/x86/motion_vector_search_sse4.cc | 2
-rw-r--r-- src/dsp/x86/obmc_sse4.cc | 287
-rw-r--r-- src/dsp/x86/obmc_sse4.h | 6
-rw-r--r-- src/dsp/x86/super_res_sse4.cc | 175
-rw-r--r-- src/dsp/x86/super_res_sse4.h | 12
-rw-r--r-- src/dsp/x86/transpose_sse4.h | 6
-rw-r--r-- src/dsp/x86/warp_sse4.cc | 2
-rw-r--r-- src/dsp/x86/weight_mask_sse4.cc | 633
-rw-r--r-- src/dsp/x86/weight_mask_sse4.h | 67
-rw-r--r-- src/gav1/decoder_settings.h | 6
-rw-r--r-- src/gav1/symbol_visibility.h | 7
-rw-r--r-- src/gav1/version.h | 2
-rw-r--r-- src/obu_parser.cc | 34
-rw-r--r-- src/obu_parser.h | 6
-rw-r--r-- src/post_filter.h | 9
-rw-r--r-- src/post_filter/cdef.cc | 2
-rw-r--r-- src/post_filter/loop_restoration.cc | 24
-rw-r--r-- src/post_filter/post_filter.cc | 32
-rw-r--r-- src/post_filter/super_res.cc | 52
-rw-r--r-- src/residual_buffer_pool.cc | 3
-rw-r--r-- src/residual_buffer_pool.h | 77
-rw-r--r-- src/threading_strategy.cc | 25
-rw-r--r-- src/tile.h | 16
-rw-r--r-- src/tile/bitstream/palette.cc | 6
-rw-r--r-- src/tile/tile.cc | 207
-rw-r--r-- src/utils/array_2d.h | 2
-rw-r--r-- src/utils/block_parameters_holder.cc | 56
-rw-r--r-- src/utils/block_parameters_holder.h | 37
-rw-r--r-- src/utils/common.h | 5
-rw-r--r-- src/utils/constants.h | 46
-rw-r--r-- src/utils/cpu.cc | 2
-rw-r--r-- src/utils/cpu.h | 2
-rw-r--r-- src/utils/dynamic_buffer.h | 2
-rw-r--r-- src/utils/libgav1_utils.cmake | 2
-rw-r--r-- src/utils/logging.cc | 2
-rw-r--r-- src/utils/logging.h | 20
-rw-r--r-- src/utils/memory.h | 2
-rw-r--r-- src/utils/parameter_tree.cc | 133
-rw-r--r-- src/utils/parameter_tree.h | 113
-rw-r--r-- src/utils/raw_bit_reader.h | 2
-rw-r--r-- src/utils/threadpool.cc | 20
-rw-r--r-- src/utils/types.h | 24
-rw-r--r-- tests/block_utils.cc | 130
-rw-r--r-- tests/block_utils.h | 62
-rw-r--r-- tests/libgav1_tests.cmake | 626
-rw-r--r-- tests/third_party/libvpx/LICENSE | 30
-rw-r--r-- tests/third_party/libvpx/acm_random.h | 91
-rw-r--r-- tests/third_party/libvpx/md5_helper.h | 53
-rw-r--r-- tests/third_party/libvpx/md5_utils.cc | 249
-rw-r--r-- tests/third_party/libvpx/md5_utils.h | 41
-rw-r--r-- tests/utils.cc | 120
-rw-r--r-- tests/utils.h | 138
-rw-r--r-- tests/utils_test.cc | 190
179 files changed, 36565 insertions, 7327 deletions
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..b934084
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* whitespace=tab-in-indent,space-before-tab,trailing-space
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d00ae6..5e9e17a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,31 +36,17 @@ endif()
set(libgav1_examples "${libgav1_root}/examples")
set(libgav1_source "${libgav1_root}/src")
-include(FindThreads)
-
-include("${libgav1_examples}/libgav1_examples.cmake")
-include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
-include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
-include("${libgav1_root}/cmake/libgav1_flags.cmake")
-include("${libgav1_root}/cmake/libgav1_helpers.cmake")
-include("${libgav1_root}/cmake/libgav1_install.cmake")
-include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
include("${libgav1_root}/cmake/libgav1_options.cmake")
-include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
-include("${libgav1_root}/cmake/libgav1_targets.cmake")
-include("${libgav1_root}/cmake/libgav1_variables.cmake")
-include("${libgav1_source}/dsp/libgav1_dsp.cmake")
-include("${libgav1_source}/libgav1_decoder.cmake")
-include("${libgav1_source}/utils/libgav1_utils.cmake")
libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
"Enables optimized code." VALUE ON)
-libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING
- "Enables avx2 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+ VALUE ON)
libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
VALUE ON)
libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
"Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
libgav1_option(
NAME LIBGAV1_VERBOSE HELPSTRING
"Enables verbose build system output. Higher numbers are more verbose." VALUE
@@ -70,6 +56,23 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
libgav1_optimization_detect()
libgav1_set_build_definitions()
libgav1_set_cxx_flags()
@@ -109,13 +112,27 @@ if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
endif()
-add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
- "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(NOT EXISTS "${libgav1_abseil}")
+ message(
+ FATAL_ERROR
+ "Abseil not found. This dependency is required by the"
+ " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+ " not defined. To continue, download the Abseil repository to"
+ " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n"
+ " clone \\\n"
+ " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+endif()
+add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
libgav1_reset_target_lists()
libgav1_add_dsp_targets()
libgav1_add_decoder_targets()
libgav1_add_examples_targets()
+libgav1_add_tests_targets()
libgav1_add_utils_targets()
libgav1_setup_install_target()
diff --git a/README.md b/README.md
index 8ab8eab..3155970 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,18 @@ information on the AV1 video format can be found at
From within the libgav1 directory:
```shell
- $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+ Note: Abseil is required by the examples and tests. libgav1 will depend on
+ it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4. (Optional) [GoogleTest](https://github.com/google/googletest)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone https://github.com/google/googletest.git third_party/googletest
```
### Compile
@@ -58,10 +69,11 @@ Configuration options:
* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
dependency from the core library. Automatically defined in
- `src/utils/threadpool.h` if unset.
+ `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+ otherwise.
* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
- allowed to create. Has to be an integer > 0. Otherwise this is ignored.
- The default value is 128.
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored. The
+ default value is 128.
* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
is used to determine when to use frame parallel decoding. Frame parallel
decoding will be used if |threads| > |tile_count| * this multiplier. Has to
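The frame parallel rule above reduces to a one-line predicate. A minimal sketch, assuming the caller already knows the configured thread count and the sequence's tile count; the function and parameter names are illustrative, not libgav1 symbols:

```cpp
// Sketch of the README's rule: frame parallel decoding is used only when
// threads > tile_count * multiplier.
bool UseFrameParallel(int threads, int tile_count, int threshold_multiplier) {
  return threads > tile_count * threshold_multiplier;
}
```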
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake
index b170e7e..fc83490 100644
--- a/cmake/libgav1_build_definitions.cmake
+++ b/cmake/libgav1_build_definitions.cmake
@@ -21,7 +21,24 @@ macro(libgav1_set_build_definitions)
string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
libgav1_load_version_info()
- set(LIBGAV1_SOVERSION 0)
+
+ # Library version info. See the libtool docs for updating the values:
+ # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+ #
+ # c=<current>, r=<revision>, a=<age>
+ #
+ # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+ # passed to libtool.
+ #
+ # We set LIBGAV1_SOVERSION = [c-a].a.r
+ set(LT_CURRENT 0)
+ set(LT_REVISION 0)
+ set(LT_AGE 0)
+ math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
+ set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
+ unset(LT_CURRENT)
+ unset(LT_REVISION)
+ unset(LT_AGE)
list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
"${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
@@ -89,9 +106,7 @@ macro(libgav1_set_build_definitions)
endif()
if(build_type_lowercase MATCHES "rel")
- # TODO(tomfinegan): this value is only a concern for the core library and
- # can be made smaller if the test targets are avoided.
- list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+ list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608")
endif()
list(APPEND libgav1_msvc_cxx_flags
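As a worked example of the versioning rule in the comment added above (the values here are illustrative; this patch sets current, revision and age to 0): with c=1, r=2, a=0, LIBGAV1_SOVERSION = [c-a].a.r = 1.0.2, so the shared object would be installed as libgav1.so.1.0.2. The same arithmetic as a C++ sketch:

```cpp
// Illustrative check of the [c-a].a.r rule; not part of the build system.
constexpr int lt_current = 1;   // c
constexpr int lt_revision = 2;  // r
constexpr int lt_age = 0;       // a
constexpr int soversion_major = lt_current - lt_age;  // [c-a] == 1
static_assert(soversion_major == 1, "SOVERSION major is current minus age");
// Full SOVERSION string: "1.0.2" (major.age.revision).
```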
diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake
index 2d8d9a6..a5408e2 100644
--- a/cmake/libgav1_flags.cmake
+++ b/cmake/libgav1_flags.cmake
@@ -205,7 +205,7 @@ macro(libgav1_test_exe_linker_flag)
# Restore cached global exe linker flags.
if(cached_CMAKE_EXE_LINKER_FLAGS)
- set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
else()
unset(CMAKE_EXE_LINKER_FLAGS)
endif()
@@ -249,3 +249,15 @@ macro(libgav1_set_cxx_flags)
libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
endmacro()
+
+# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS.
+#
+# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore,
+# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS
+# are complete.
+macro(libgav1_set_test_flags)
+ if(LIBGAV1_ENABLE_TESTS)
+ set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS})
+ list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than")
+ endif()
+endmacro()
diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake
index 76d8d67..ac16257 100644
--- a/cmake/libgav1_helpers.cmake
+++ b/cmake/libgav1_helpers.cmake
@@ -20,7 +20,13 @@ set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
# Kills build generation using message(FATAL_ERROR) and outputs all data passed
# to the console via use of $ARGN.
macro(libgav1_die)
- message(FATAL_ERROR ${ARGN})
+ # macro parameters are not variables so a temporary is needed to work with
+ # list().
+ set(msg ${ARGN})
+ # message(${ARGN}) will merge all list elements with no separator while
+ # "${ARGN}" will output the list as a ';' delimited string.
+ list(JOIN msg " " msg)
+ message(FATAL_ERROR "${msg}")
endmacro()
# Converts semi-colon delimited list variable(s) to string. Output is written to
@@ -94,10 +100,10 @@ macro(libgav1_create_dummy_source_file)
"${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
set(dummy_source_code
"// Generated file. DO NOT EDIT!\n"
- "// C++ source file created for target ${cdsf_TARGET}. \n"
- "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+ "// C++ source file created for target ${cdsf_TARGET}.\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n"
"void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
- file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+ file(WRITE "${dummy_source_file}" ${dummy_source_code})
target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake
index 4bb2263..2f9ee07 100644
--- a/cmake/libgav1_sanitizer.cmake
+++ b/cmake/libgav1_sanitizer.cmake
@@ -39,7 +39,9 @@ macro(libgav1_configure_sanitizer)
list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
"-fno-optimize-sibling-calls")
- libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ # Check the linker flags first as they may be required in the compile check
+ # to avoid undefined symbols related to the sanitizer.
libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
endif()
endmacro()
diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake
index 78b4865..997f8bd 100644
--- a/cmake/libgav1_targets.cmake
+++ b/cmake/libgav1_targets.cmake
@@ -29,7 +29,7 @@ endmacro()
# Creates an executable target. The target name is passed as a parameter to the
# NAME argument, and the sources passed as a parameter to the SOURCES argument:
-# libgav1_add_test(NAME <name> SOURCES <sources> [optional args])
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
#
# Optional args:
# cmake-format: off
@@ -115,15 +115,35 @@ macro(libgav1_add_executable)
target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
endif()
- if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ unset(exe_LIBGAV1_COMPILE_FLAGS)
+ if(exe_TEST)
+ list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$")
+ list(LENGTH exe_SOURCES exe_SOURCES_length)
+ if(exe_SOURCES_length EQUAL 0)
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS})
+ endif()
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS)
target_compile_options(${exe_NAME}
- PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ PRIVATE ${exe_COMPILE_FLAGS}
+ ${exe_LIBGAV1_COMPILE_FLAGS})
endif()
if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
- set_target_properties(${exe_NAME}
- PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
- ${LIBGAV1_EXE_LINKER_FLAGS})
+ list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}")
+ if(${CMAKE_VERSION} VERSION_LESS "3.13")
+ # LINK_FLAGS is managed as a string.
+ libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+ else()
+ target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+ endif()
endif()
if(exe_OBJLIB_DEPS)
@@ -137,7 +157,7 @@ macro(libgav1_add_executable)
endif()
if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
- target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
endif()
if(exe_LIB_DEPS)
@@ -321,7 +341,9 @@ macro(libgav1_add_library)
endif()
if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
- set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+ set_target_properties(${lib_NAME}
+ PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION
+ ${LIBGAV1_SOVERSION_MAJOR})
endif()
if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
index 4de0ba2..1408e8c 100644
--- a/examples/gav1_decode.cc
+++ b/examples/gav1_decode.cc
@@ -419,6 +419,9 @@ int main(int argc, char* argv[]) {
input_buffers.ReleaseInputBuffer(input_buffer);
}
input_buffer = nullptr;
+ // Clear any in progress frames to ensure the output frame limit is
+ // respected.
+ decoder.SignalEOS();
}
} while (input_buffer != nullptr ||
(!file_reader->IsEndOfFile() && !limit_reached) ||
diff --git a/examples/logging.h b/examples/logging.h
index c0bcad7..cf5a09f 100644
--- a/examples/logging.h
+++ b/examples/logging.h
@@ -46,7 +46,7 @@ constexpr const char* Basename(const char* file_name, size_t offset) {
#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
do { \
constexpr const char* libgav1_examples_basename = \
- ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
__func__, error_string); \
} while (false)
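A hedged usage sketch of the macro touched above (the message is made up; the macro expands to the fprintf shown in the diff):

```cpp
// Logs "<file>:<line> (<function>): failed to parse the input file." to
// stderr.
LIBGAV1_EXAMPLES_LOG_ERROR("failed to parse the input file");
```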
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
index 751671d..e23903c 100644
--- a/src/decoder_impl.cc
+++ b/src/decoder_impl.cc
@@ -36,7 +36,6 @@
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/raw_bit_reader.h"
#include "src/utils/segmentation.h"
#include "src/utils/threadpool.h"
@@ -631,10 +630,6 @@ DecoderImpl::~DecoderImpl() {
}
StatusCode DecoderImpl::Init() {
- if (!GenerateWedgeMask(&wedge_masks_)) {
- LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
- return kStatusOutOfMemory;
- }
if (!output_frame_queue_.Init(kMaxLayers)) {
LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
return kStatusOutOfMemory;
@@ -857,6 +852,10 @@ StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
return kStatusOutOfMemory;
}
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
if (IsNewSequenceHeader(*obu)) {
const ObuSequenceHeader& sequence_header = obu->sequence_header();
const Libgav1ImageFormat image_format =
@@ -1050,6 +1049,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
return kStatusOutOfMemory;
}
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
if (IsNewSequenceHeader(*obu)) {
const ObuSequenceHeader& sequence_header = obu->sequence_header();
const Libgav1ImageFormat image_format =
@@ -1278,8 +1281,7 @@ StatusCode DecoderImpl::DecodeTiles(
// without having to check for boundary conditions.
if (!frame_scratch_buffer->block_parameters_holder.Reset(
frame_header.rows4x4 + kMaxBlockHeight4x4,
- frame_header.columns4x4 + kMaxBlockWidth4x4,
- sequence_header.use_128x128_superblock)) {
+ frame_header.columns4x4 + kMaxBlockWidth4x4)) {
return kStatusOutOfMemory;
}
const dsp::Dsp* const dsp =
@@ -1646,6 +1648,17 @@ bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
return sequence_header_changed;
}
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+ if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+ return true;
+ }
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ return false;
+ }
+ wedge_masks_initialized_ = true;
+ return true;
+}
+
bool DecoderImpl::MaybeInitializeQuantizerMatrix(
const ObuFrameHeader& frame_header) {
if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
index 721b666..b52ecdf 100644
--- a/src/decoder_impl.h
+++ b/src/decoder_impl.h
@@ -215,6 +215,10 @@ class DecoderImpl : public Allocable {
// |quantizer_matrix_initialized_| to true.
bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+ // Allocates and generates the |wedge_masks_| if necessary and sets
+ // |wedge_masks_initialized_| to true.
+ bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
// Elements in this queue cannot be moved with std::move since the
// |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
Queue<TemporalUnit> temporal_units_;
@@ -233,6 +237,7 @@ class DecoderImpl : public Allocable {
BufferPool buffer_pool_;
WedgeMaskArray wedge_masks_;
+ bool wedge_masks_initialized_ = false;
QuantizerMatrix quantizer_matrix_;
bool quantizer_matrix_initialized_ = false;
FrameScratchBufferPool frame_scratch_buffer_pool_;
diff --git a/src/decoder_state.h b/src/decoder_state.h
index 897c99f..ea5c792 100644
--- a/src/decoder_state.h
+++ b/src/decoder_state.h
@@ -33,7 +33,6 @@ struct DecoderState {
for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
++ref_index, mask >>= 1) {
if ((mask & 1) != 0) {
- reference_valid[ref_index] = true;
reference_frame_id[ref_index] = current_frame_id;
reference_frame[ref_index] = current_frame;
reference_order_hint[ref_index] = order_hint;
@@ -43,7 +42,6 @@ struct DecoderState {
// Clears all the reference frames.
void ClearReferenceFrames() {
- reference_valid = {};
reference_frame_id = {};
reference_order_hint = {};
for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
@@ -51,12 +49,11 @@ struct DecoderState {
}
}
- // reference_valid and reference_frame_id are used only if
- // sequence_header_.frame_id_numbers_present is true.
- // The reference_valid array is indexed by a reference picture slot number.
- // A value (boolean) in the array signifies whether the corresponding
- // reference picture slot is valid for use as a reference picture.
- std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+ // reference_frame_id and current_frame_id have meaningful values and are used
+ // in checks only if sequence_header_.frame_id_numbers_present is true. If
+ // sequence_header_.frame_id_numbers_present is false, reference_frame_id and
+ // current_frame_id are assigned the default value 0 and are not used in
+ // checks.
std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
// A valid value of current_frame_id is an unsigned integer of at most 16
// bits. -1 indicates current_frame_id is not initialized.
@@ -81,6 +78,11 @@ struct DecoderState {
// * |true| indicates that the reference frame is a backwards reference.
// Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ // The RefValid[i] variable in the spec does not need to be stored explicitly.
+ // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a
+ // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set
+ // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the
+ // spec is 1, then reference_frame[i] contains a frame buffer pointer.
std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
};
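A minimal sketch of the convention described in the new comment, not code from this patch: a reference slot is valid in the spec's sense exactly when its buffer pointer is non-null.

```cpp
// Hypothetical helper illustrating the RefValid convention documented above.
bool IsReferenceSlotValid(const DecoderState& state, int ref_index) {
  return state.reference_frame[ref_index] != nullptr;
}
```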
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
index 834e8b4..5b4c094 100644
--- a/src/dsp/arm/average_blend_neon.cc
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@ namespace {
constexpr int kInterPostRoundBit =
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
const int16_t* prediction_1) {
const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -128,13 +133,139 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ const uint16x8_t pred0 = vld1q_u16(prediction_0);
+ const uint16x8_t pred1 = vld1q_u16(prediction_1);
+ const uint32x4_t pred_lo =
+ vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+ const uint32x4_t pred_hi =
+ vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+ const int32x4_t offset_lo =
+ vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+ const int32x4_t offset_hi =
+ vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+ const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+ const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+ return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, const int width,
+ uint16_t* dest,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ int x = width;
+ do {
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = height;
+
+ const ptrdiff_t dst_stride = dest_stride >> 1;
+ const int32x4_t compound_offset =
+ vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+ const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ if (width == 4) {
+ do {
+ const uint16x8_t result =
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u16(dst, vget_low_u16(result));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(result));
+ dst += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
-void AverageBlendInit_NEON() { Init8bpp(); }
+void AverageBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
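Per pixel, the new 10-bit path computes the scalar expression below. This is a reading aid for the intrinsics, not library code; compound_offset and post_round_bit stand in for libgav1's kCompoundOffset and kInterPostRoundBit.

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of AverageBlend8Row for 10 bpp: add the two compound
// predictions, remove the doubled compound offset, round-shift, then clamp
// to the 10-bit range (vqrshrun + vminq in the NEON version).
uint16_t AverageBlendPixel10bpp(uint16_t pred0, uint16_t pred1,
                                int compound_offset, int post_round_bit) {
  const int32_t sum =
      static_cast<int32_t>(pred0) + pred1 - 2 * compound_offset;
  const int32_t rounded =
      (sum + (1 << post_round_bit)) >> (post_round_bit + 1);
  return static_cast<uint16_t>(std::clamp(rounded, 0, 1023));
}
```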
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
index 4d0e76f..60c72d6 100644
--- a/src/dsp/arm/cdef_neon.cc
+++ b/src/dsp/arm/cdef_neon.cc
@@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
// 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
// 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
- partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
@@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
// 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
// 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
// 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
- const uint8x8_t v_zero = vdup_n_u8(0);
- partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
- for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+ for (int i = 2; i < 8; ++i) {
partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
}
@@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
const uint16x8_t threshold, const int16x8_t damping) {
- // If reference > pixel, the difference will be negative, so covert to 0 or
+ // If reference > pixel, the difference will be negative, so convert to 0 or
// -1.
const uint16x8_t sign = vcgtq_u16(reference, pixel);
const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
@@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index dcb7567..05e0d05 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -28,8 +28,7 @@
#if 0
#include <cstdio>
-
-#include "absl/strings/str_cat.h"
+#include <string>
constexpr bool kEnablePrintRegs = true;
@@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
DebugRegisterQ r;
- vst1q_u32(r.u32, val.val[0]);
- const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+ vst1q_s32(r.i32, val.val[0]);
+ const std::string name0 = name + std::string(".val[0]");
PrintVectQ(r, name0.c_str(), 32);
- vst1q_u32(r.u32, val.val[1]);
- const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+ vst1q_s32(r.i32, val.val[1]);
+ const std::string name1 = name + std::string(".val[1]");
PrintVectQ(r, name1.c_str(), 32);
}
@@ -169,14 +168,14 @@ inline void PrintReg(const int8x8_t val, const char* name) {
// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %d\n", name, x);
+ fprintf(stderr, "%s: %d\n", name, x);
}
}
// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %x\n", name, x);
+ fprintf(stderr, "%s: %x\n", name, x);
}
}
@@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) {
ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+ vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+ vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
//------------------------------------------------------------------------------
// Bit manipulation.
// vshXX_n_XX() requires an immediate.
template <int shift>
-inline uint8x8_t LeftShift(const uint8x8_t vector) {
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline uint8x8_t RightShift(const uint8x8_t vector) {
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline int8x8_t RightShift(const int8x8_t vector) {
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}
@@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) {
#endif // defined(__aarch64__)
}
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
return vaddvq_u32(a);
@@ -447,6 +465,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
}
// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
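The 16-bit Transpose4x4 overload added above implements the layout documented in its comments; a scalar reference of the same permutation, for illustration only:

```cpp
#include <cstdint>
#include <utility>

// Scalar model of the uint16x4_t Transpose4x4 above: element (row, col)
// moves to (col, row).
void Transpose4x4Scalar(uint16_t m[4][4]) {
  for (int row = 0; row < 4; ++row) {
    for (int col = row + 1; col < 4; ++col) {
      std::swap(m[row][col], m[col][row]);
    }
  }
}
```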
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index fd9b912..331bfe2 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src,
return vreinterpretq_s16_u16(sum);
}
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- uint8x8_t v_src[8];
- const uint8x16_t src_long = vld1q_u8(src);
- int16x8_t sum;
-
- if (filter_index < 2) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
- v_src[0] = vget_low_u8(src_long);
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
- } else if (filter_index == 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
- } else if (filter_index > 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
- }
- return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- uint16x8_t sum;
- const uint8x8_t input0 = vld1_u8(src);
- src += src_stride;
- const uint8x8_t input1 = vld1_u8(src);
- uint8x8x2_t input = vzip_u8(input0, input1);
-
- if (filter_index == 3) {
- // tap signs : + +
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- } else if (filter_index == 4) {
- // tap signs : - + + -
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- } else {
- // tap signs : + + + +
- sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- }
-
- return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
- bool negative_outside_taps = true, bool is_2d = false,
- bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
-
- // 4 tap filters are never used when width > 4.
- if (num_taps != 4 && width > 4) {
- int y = 0;
+ if (!is_2d) {
+ int y = height;
do {
int x = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(src + x);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
vst1q_u16(&dest16[x], v_sum);
} else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
vst1_u8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
+ } else {
+ int x = 0;
+ do {
+ const uint8_t* s = src + x;
+ int y = height;
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(s);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(dest16, v_sum);
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ uint8x8_t v_src[4];
+ int16x8_t sum;
+ v_src[0] = vld1_u8(src);
+ if (filter_index == 3) {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+ v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_2d || is_compound) {
+ const uint16x4_t v_sum = vreinterpret_u16_s16(
+ vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+ vst1_u16(dest16, v_sum);
+ } else {
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height >> 1;
+ do {
+ const uint8x8_t input0 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src + src_stride);
+ const uint8x8x2_t input = vzip_u8(input0, input1);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(input.val[0], v_tap[3]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ }
+ int16x8_t s = vreinterpretq_s16_u16(sum);
+ if (is_2d) {
+ const uint16x8_t v_sum =
+ vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vgetq_lane_u16(v_sum, 0);
+ dest16[1] = vgetq_lane_u16(v_sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(v_sum, 1);
+ dest16[1] = vgetq_lane_u16(v_sum, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+ dest8[0] = vget_lane_u8(result, 0);
+ dest8[1] = vget_lane_u8(result, 2);
+ dest8 += pred_stride;
+ dest8[0] = vget_lane_u8(result, 1);
+ dest8[1] = vget_lane_u8(result, 3);
+ dest8 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const uint8x8_t input = vld1_u8(src);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ sum = vmull_u8(input, v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ assert(width < 8 || filter_index <= 3);
+ // Don't simplify the redundant if conditions with the template parameters,
+ // which helps the compiler generate compact code.
+ if (width >= 8 && filter_index <= 3) {
+ FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+ is_compound>(src, src_stride, dest, pred_stride,
+ width, height, v_tap);
return;
}
- // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+ // Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(num_taps <= 4);
- if (num_taps <= 4) {
+ assert(filter_index >= 3 && filter_index <= 5);
+ if (filter_index >= 3 && filter_index <= 5) {
if (width == 4) {
- int y = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
- v_tap);
- vst1_u16(dest16, vget_low_u16(v_sum));
- } else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
- v_tap);
- StoreLo4(&dest8[0], result);
- }
- src += src_stride;
- dest8 += pred_stride;
- dest16 += pred_stride;
- } while (++y < height);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
return;
}
-
+ assert(width == 2);
if (!is_compound) {
- int y = 0;
- do {
- if (is_2d) {
- const uint16x8_t sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else {
- const uint8x8_t sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- dest8[0] = vget_lane_u8(sum, 0);
- dest8[1] = vget_lane_u8(sum, 2);
- dest8 += pred_stride;
-
- dest8[0] = vget_lane_u8(sum, 1);
- dest8[1] = vget_lane_u8(sum, 3);
- dest8 += pred_stride;
- }
-
- src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
-
- // The 2d filters have an odd |height| because the horizontal pass
- // generates context for the vertical pass.
- if (is_2d) {
- assert(height % 2 == 1);
- uint16x8_t sum;
- const uint8x8_t input = vld1_u8(src);
- if (filter_index == 3) { // |num_taps| == 2
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- } else if (filter_index == 4) {
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- } else {
- assert(filter_index == 5);
- sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- }
- // |sum| contains an int16_t value.
- sum = vreinterpretq_u16_s16(vrshrq_n_s16(
- vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
- Store2<0>(dest16, sum);
- }
+ FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
assert(width >= 8);
constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
int x = 0;
do {
- int16x8_t srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 4) {
- srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 6) {
- srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps == 8) {
- srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
}
}
}
- int y = 0;
+ uint8_t* d8 = dst8 + x;
+ uint16_t* d16 = dst16 + x;
+ int y = height;
do {
- srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
-
- const int16x8_t sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
if (is_compound) {
- vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+ d16 += dst_stride;
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+ d16 += dst_stride;
} else {
- vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ vst1_u8(d8, vqmovun_s16(sum0));
+ d8 += dst_stride;
+ vst1_u8(d8, vqmovun_s16(sum1));
+ d8 += dst_stride;
}
-
- srcs[0] = srcs[1];
+ srcs[0] = srcs[2];
if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
}
}
}
- } while (++y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
- int y = 0;
+ int y = height;
do {
srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
src += 8;
@@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst,
}
}
}
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
constexpr int next_row = (num_taps < 6) ? 4 : 8;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ FilterHorizontal<2, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
- FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, false, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else {
- FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<0, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
- FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<5, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<3, true, is_2d, is_compound>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+ }
+}
+
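+// Dispatches the 2D vertical pass to the width-specialized helpers above for
+// the non-compound path.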
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction, const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
}
}
@@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference,
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
const int intermediate_height = height + vertical_taps - 1;
-
const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
-
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 2) {
- Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 6) {
- if (width == 2) {
- Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 4) {
- if (width == 2) {
- Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else { // |vertical_taps| == 2
- if (width == 2) {
- Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
}
}
@@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference,
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
uint8x8x3_t ret;
const uint8x16_t src_val = vld1q_u8(src_x);
ret.val[0] = vget_low_u8(src_val);
@@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
}
template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
@@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
return;
}
@@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap(
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped index vectors.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap(
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
vadd_u8(src_indices_base, vdup_n_u8(2)),
vadd_u8(src_indices_base, vdup_n_u8(3))};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
for (int i = 0; i < 6; ++i) {
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap(
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap(
// This function handles blocks of width 2 or 4.
template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
const int filter_index, const int step_y,
- const int height, void* dest,
+ const int height, void* const dest,
const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
const int16_t* src_y = src;
@@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1_s16(src_y + i * src_stride);
}
@@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
const int subpixel_y, const int filter_index,
const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+ void* const dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
// A possible improvement is to use arithmetic to decide how many times to
// apply filters to same source before checking whether to load new srcs.
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
uint8_t* dest_y;
int x = 0;
- do { // x < width
- const int16_t* src_x = src + x;
+ do {
+ const int16_t* const src_x = src + x;
const int16_t* src_y = src_x;
dest16_y = static_cast<uint16_t*>(dest) + x;
dest_y = static_cast<uint8_t*>(dest) + x;
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1q_s16(src_y + i * src_stride);
}
@@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
@@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference,
const int vertical_filter_index, const int subpixel_x,
const int subpixel_y, const int step_x,
const int step_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ void* const prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
@@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference,
const int /*vertical_filter_index*/,
const int horizontal_filter_id,
const int /*vertical_filter_id*/, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint8_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint8_t*>(prediction);
DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
horizontal_filter_id, filter_index);
@@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) {
template <int filter_index, bool is_compound = false,
bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height,
const uint8x8_t* const taps) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
assert(width >= 8);
int x = 0;
@@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
}
}
+ // Decreasing the y loop counter produces worse code with clang.
+ // Don't unroll this loop: it generates too much code and makes the decoder
+ // even slower.
int y = 0;
do {
srcs[next_row] = vld1_u8(src_x);
@@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = Load4(src);
src += src_stride;
- int y = 0;
+ int y = height;
do {
srcs[0] = Load4<1>(src, srcs[0]);
src += src_stride;
@@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 4) {
srcs[4] = vdup_n_u8(0);
@@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[1] = vext_u8(srcs[0], srcs[2], 4);
- int y = 0;
+ int y = height;
do {
srcs[2] = Load4<1>(src, srcs[2]);
src += src_stride;
@@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[0] = srcs[2];
srcs[1] = srcs[3];
srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 6) {
srcs[6] = vdup_n_u8(0);
@@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[3] = vext_u8(srcs[2], srcs[4], 4);
- int y = 0;
+ int y = height;
do {
srcs[4] = Load4<1>(src, srcs[4]);
src += src_stride;
@@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[2] = srcs[4];
srcs[3] = srcs[5];
srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 8) {
srcs[8] = vdup_n_u8(0);
@@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
srcs[5] = vext_u8(srcs[4], srcs[6], 4);
- int y = 0;
+ int y = height;
do {
srcs[6] = Load4<1>(src, srcs[6]);
src += src_stride;
@@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[4] = srcs[6];
srcs[5] = srcs[7];
srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2186,14 +2211,14 @@ void ConvolveVertical_NEON(const void* const reference,
const int vertical_filter_index,
const int /*horizontal_filter_id*/,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint8_t*>(prediction);
+ auto* const dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
assert(vertical_filter_id != 0);
@@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
@@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON(
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
if (width >= 16) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON(
} while (x < width);
src += src_stride;
dest += width;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
src += src_stride;
dest += width;
- } while (++y < height);
- } else { /* width == 4 */
+ } while (--y != 0);
+ } else { // width == 4
uint8x8_t v_src = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
v_src = Load4<0>(&src[0], v_src);
src += src_stride;
@@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON(
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
dest += 4 << 1;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint16_t*>(prediction);
+ auto* const dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
uint8x8_t taps[8];
@@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
+ const int width, const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint16_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
src, reference_stride, dest, width, width, height, horizontal_filter_id,
filter_index);
}
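+// Dispatches the 2D vertical pass for the compound path; the compound output
+// uses |width| as its stride.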
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
void ConvolveCompound2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* const prediction,
const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
@@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference,
const auto* const src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride -
kHorizontalOffset;
-
DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
src, src_stride, intermediate_result, width, width, intermediate_height,
horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint16_t*>(prediction);
assert(vertical_filter_id != 0);
-
- const ptrdiff_t dest_stride = width;
const int16x8_t taps = vmovl_s8(
vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
-
if (vertical_taps == 8) {
- if (width == 4) {
- Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<8, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 6) {
- if (width == 4) {
- Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<6, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 4) {
- if (width == 4) {
- Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<4, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
} else { // |vertical_taps| == 2
- if (width == 4) {
- Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<2, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
const uint8x16_t left = vld1q_u8(src);
const uint8x16_t right = vld1q_u8(src + 1);
vst1q_u8(dst, vrhaddq_u8(left, right));
@@ -2554,7 +2565,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
- int y = 0;
+ int y = height;
do {
HalfAddHorizontal(src, dst);
if (width >= 32) {
@@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
pred_stride);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t left = vld1_u8(src);
const uint8x8_t right = vld1_u8(src + 1);
@@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
src += reference_stride;
dest += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
StoreHi4(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
assert(width == 2);
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON(
dest += pred_stride;
Store2<1>(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
below[0] = vld1q_u8(src);
if (width >= 32) {
@@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
}
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyVertical_NEON(
@@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
row = vld1_u8(src);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = vld1_u8(src);
src += reference_stride;
@@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t row = Load4(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load4<0>(src, below);
src += reference_stride;
@@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else {
assert(width == 2);
uint8x8_t row = Load2(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load2<0>(src, below);
src += reference_stride;
@@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON(
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
}
}
@@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopy2D_NEON(
@@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
uint8x8_t left = Load2(src);
uint8x8_t right = Load2(src + 1);
@@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON(
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON(
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index 04952ab..a0cd0ac 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -30,10 +30,12 @@
namespace libgav1 {
namespace dsp {
-namespace {
constexpr int kInterPostRoundBit = 4;
+namespace low_bitdepth {
+namespace {
+
inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
const int16x4_t weights[2]) {
@@ -185,13 +187,167 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+ const uint16x4x2_t pred1,
+ const uint16x4_t weights[2]) {
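+ // |pred0| and |pred1| carry the compound prediction bias, so after weighting
+ // (the two weights sum to 16) the bias becomes kCompoundOffset * 16 and is
+ // subtracted before the final rounding shift.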
+ const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+ const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+ const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x2_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+ return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+ const uint16x4x4_t pred1,
+ const uint16x4_t weights[2]) {
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+ const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+ const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+ const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+ const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+ const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+ const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+ const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+ const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+ const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x4_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+ result.val[2] =
+ vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+ result.val[3] =
+ vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+ return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+ uint16x4x2_t x;
+ // gcc/clang (64 bit) optimizes the following to ldp.
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+ uint16x4x4_t x;
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ x.val[2] = vld1_u16(ptr + 8);
+ x.val[3] = vld1_u16(ptr + 12);
+ return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
-void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
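+ // Widths 4 and 8 are processed two rows per iteration; wider blocks are
+ // handled 16 pixels at a time within each row.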
+ if (width == 4) {
+ int y = height;
+ do {
+ const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+ const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+ const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + dst_stride, res.val[1]);
+ dst += dst_stride << 1;
+ pred_0 += 8;
+ pred_1 += 8;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + 4, res.val[1]);
+ vst1_u16(dst + dst_stride, res.val[2]);
+ vst1_u16(dst + dst_stride + 4, res.val[3]);
+ dst += dst_stride << 1;
+ pred_0 += 16;
+ pred_1 += 16;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst + x, res.val[0]);
+ vst1_u16(dst + x + 4, res.val[1]);
+ vst1_u16(dst + x + 8, res.val[2]);
+ vst1_u16(dst + x + 12, res.val[3]);
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h
index 4d8824c..94a799c 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/src/dsp/arm/distance_weighted_blend_neon.h
@@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 2612466..8ee3745 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
index 00b186a..074283f 100644
--- a/src/dsp/arm/intra_edge_neon.cc
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -25,7 +25,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
-#include "src/utils/common.h" // RightShiftWithRounding()
+#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
@@ -35,6 +35,11 @@ namespace {
// required.
constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
assert(strength == 1 || strength == 2 || strength == 3);
const int kernel_index = strength - 1;
@@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// elements written is |size| - 1.
if (size == 1) return;
+ const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0f0e0d0c0b0a0908));
// |strength| 1 and 2 use a 3 tap filter.
if (strength < 3) {
// The last value requires extending the buffer (duplicating
@@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// |remainder| == 1 then we don't have to do anything.
const int remainder = (size - 1) & 0xf;
if (remainder > 1) {
- uint8_t temp[16];
const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
@@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+ // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[size - 1] = last_val;
@@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
// Like the 3 tap but if there are two remaining values we have already
// calculated them.
if (remainder > 2) {
- uint8_t temp[16];
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
@@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+ // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[1] = special_vals[0];
@@ -284,13 +293,225 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
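+// kRemainderMask[n] has 0xffff in its first n lanes. It is used with vbslq_u16
+// to overwrite only the lanes that are still inside the block and leave the
+// remaining lanes unchanged.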
+const uint16_t kRemainderMask[8][8] = {
+ {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+ // The last value requires extending the buffer (duplicating
+ // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
+ // NEON.
+ const uint16_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
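+ // The 3 tap kernels are (4, 8, 4) and (5, 6, 5), applied with a rounding
+ // shift of 4.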
+ const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+ const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ // Process blocks of 8 values until fewer than 8 remain.
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ // Load the next input before the store below overwrites it. This loads an
+ // extra 7 values past |size| on the trailing iteration.
+ src_0 = vld1q_u16(dst_buffer + i + 7);
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated so if
+ // |remainder| == 1 then we don't have to do anything.
+ const int remainder = (size - 1) & 0x7;
+ if (remainder > 1) {
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint16_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+ // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+ uint16x8_t src_1 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+ // Load the next inputs before the store below overwrites them.
+ src_0 = vld1q_u16(dst_buffer + i + 6);
+ src_1 = vld1q_u16(dst_buffer + i + 7);
+
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0x7;
+ // As in the 3 tap case, except that if only two values remain they have
+ // already been calculated above.
+ if (remainder > 2) {
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
-void IntraEdgeInit_NEON() { Init8bpp(); }
+ // Extend first/last samples
+ pixel_buffer[-2] = pixel_buffer[-1];
+ pixel_buffer[size] = pixel_buffer[size - 1];
+
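+ // Each interpolated sample is
+ // (-s[i] + 9 * s[i + 1] + 9 * s[i + 2] - s[i + 3] + 8) >> 4, clipped to the
+ // 10-bit range; 9 * x is formed below as x + (x << 3).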
+ const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+ const int16x8_t src_hi =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+ const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+ const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+ int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+ sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+ sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+ sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+ uint16x8x2_t result_lo;
+ result_lo.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+ if (size > 8) {
+ const int16x8_t src_hi_extra =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+ const int16x8_t src9_hi_extra =
+ vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+ int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+ sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+ sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+ sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+ uint16x8x2_t result_hi;
+ result_hi.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_hi.val[1] =
+ vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ vst2q_u16(pixel_buffer + 15, result_hi);
+ } else {
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
index d3bb243..28e3494 100644
--- a/src/dsp/arm/intra_edge_neon.h
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -34,6 +34,9 @@ void IntraEdgeInit_NEON();
#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
index 45fe33b..8d8748f 100644
--- a/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -27,45 +27,20 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
-namespace {
-
-uint8x16_t Set2ValuesQ(const uint8_t* a) {
- uint16_t combined_values = a[0] | a[1] << 8;
- return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
-}
-
-uint32_t SumVector(uint32x2_t a) {
-#if defined(__aarch64__)
- return vaddv_u32(a);
-#else
- const uint64x1_t b = vpaddl_u32(a);
- return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-#endif // defined(__aarch64__)
-}
-
-uint32_t SumVector(uint32x4_t a) {
-#if defined(__aarch64__)
- return vaddvq_u32(a);
-#else
- const uint64x2_t b = vpaddlq_u32(a);
- const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
- return vget_lane_u32(vreinterpret_u32_u64(c), 0);
-#endif // defined(__aarch64__)
-}
// Divide by the number of elements.
-uint32_t Average(const uint32_t sum, const int width, const int height) {
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
}
// Subtract |val| from every element in |a|.
-void BlockSubtract(const uint32_t val,
- int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
- const int width, const int height) {
+inline void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
assert(val <= INT16_MAX);
const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
@@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val,
}
}
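Average() and BlockSubtract() are hoisted out of the 8bpp namespace here so the 10bpp code below can reuse them: the first divides the running sum by width * height with a rounded shift, the second removes that mean from every stored luma value. A standalone scalar sketch of the pair (the names and the pointer/stride signature are illustrative):

#include <cstdint>

// Scalar sketch of the shared CfL mean removal.
inline uint32_t Average_Sketch(uint32_t sum, int width_log2, int height_log2) {
  const int bits = width_log2 + height_log2;
  return (sum + (1u << (bits - 1))) >> bits;  // RightShiftWithRounding.
}

inline void BlockSubtract_Sketch(int16_t* block, int stride, int width,
                                 int height, int16_t average) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) block[y * stride + x] -= average;
  }
}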
+namespace low_bitdepth {
+namespace {
+
template <int block_width, int block_height>
void CflSubsampler420_NEON(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -122,26 +100,27 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else if (block_width == 8) {
- const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
- 8, 8, 10, 10, 12, 12, 14, 14};
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+ const uint16x8_t x_max_index =
+ vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
- const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+ const uint8x16_t row0 = vld1q_u8(src);
+ const uint8x16_t row1 = vld1q_u8(src + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
- uint8x16_t row0 = vld1q_u8(src);
- row0 = vbslq_u8(x_mask, row0, x_max0);
- uint8x16_t row1 = vld1q_u8(src + stride);
- row1 = vbslq_u8(x_mask, row1, x_max1);
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
- uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
- sum_row = vshlq_n_u16(sum_row, 1);
- running_sum = vpadalq_u16(running_sum, sum_row);
- vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
@@ -150,45 +129,35 @@ void CflSubsampler420_NEON(
sum = SumVector(running_sum);
} else /* block_width >= 16 */ {
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30};
- const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
- const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
- const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
- const uint8x16_t x_max11 =
- vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
- for (int x = 0; x < block_width; x += 16) {
- const ptrdiff_t src_x_offset = x << 1;
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
- const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
- const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
- const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
- const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
- const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
- const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
-
- uint16x8_t sum_row_lo =
- vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
- sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_lo);
- vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
-
- uint16x8_t sum_row_hi =
- vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
- sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_hi);
- vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
-
- x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+      // Calculate the 2x2 sum at the max luma offset.
+ const uint8_t a00 = src[max_luma_width - 2];
+ const uint8_t a01 = src[max_luma_width - 1];
+ const uint8_t a10 = src[max_luma_width - 2 + stride];
+ const uint8_t a11 = src[max_luma_width - 1 + stride];
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1));
+ uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+
+ ptrdiff_t src_x_offset = 0;
+ for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+ const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+ const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
+
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+ x_index = vaddq_u16(x_index, vdupq_n_u16(16));
}
+
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
}
@@ -209,17 +178,30 @@ void CflSubsampler444_NEON(
uint32_t sum;
if (block_width == 4) {
assert(max_luma_width >= 4);
+ assert(max_luma_height <= block_height);
+ assert((max_luma_height % 2) == 0);
uint32x4_t running_sum = vdupq_n_u32(0);
uint8x8_t row = vdup_n_u8(0);
- for (int y = 0; y < block_height; y += 2) {
+ uint16x8_t row_shifted;
+ int y = 0;
+ do {
row = Load4<0>(src, row);
row = Load4<1>(src + stride, row);
if (y < (max_luma_height - 1)) {
src += stride << 1;
}
- const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ y += 2;
+ } while (y < max_luma_height);
+
+ row_shifted =
+ vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+ for (; y < block_height; y += 2) {
running_sum = vpadalq_u16(running_sum, row_shifted);
vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
@@ -463,12 +445,874 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+ const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+ const uint32x4_t b = vdupq_lane_u32(a, 1);
+ return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+ const uint16x4_t a = vget_high_u16(row);
+ const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+ return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+ const int16x4_t a = vget_high_s16(row);
+ const int16x8_t b = vdupq_lane_s16(a, 0x3);
+ return b;
+}
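LastRowSamples() and LastRowResult() implement the right-edge extension used when the visible luma area is narrower than the transform block: the last visible pair (for 4:2:0 input rows) or the last single value (for results) is repeated across the remaining lanes. The scalar equivalent is a simple replication loop; this sketch uses an illustrative name:

#include <cstdint>

// Scalar sketch of the border extension performed per 8-lane vector by
// LastRowSamples() / LastRowResult().
inline void ReplicateRightEdge_Sketch(uint16_t* row, int visible_width,
                                      int block_width) {
  for (int x = visible_width; x < block_width; ++x) {
    row[x] = row[visible_width - 1];
  }
}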
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+ vst1_s16(luma_ptr + kCflLumaBufferStride,
+ vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+ return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+ return result_shifted;
+}
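Both StoreLumaResults functions finish the 4:2:0 subsampling step: each output value is twice the sum of a 2x2 block of source luma, i.e. the 2x2 average scaled by 8, which matches the << 3 precision used by the 4:4:4 path. StoreLumaResults4_420 writes two 4-wide rows and StoreLumaResults8_420 writes one 8-wide row. A per-sample scalar sketch (the name and stride convention are illustrative):

#include <cstdint>

// Scalar sketch of one 4:2:0 CfL luma value at chroma column |x|.
// |stride| is in uint16_t units.
inline uint16_t CflLuma420Sample_Sketch(const uint16_t* src, int stride,
                                        int x) {
  const uint32_t sum = src[2 * x] + src[2 * x + 1] + src[stride + 2 * x] +
                       src[stride + 2 * x + 1];
  return static_cast<uint16_t>(sum << 1);
}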
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint16x4_t sum = vdup_n_u16(0);
+ uint16x4_t samples[2];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1_u16(src);
+ samples[1] = vld1_u16(src + src_stride);
+ src += src_stride << 1;
+ sum = vadd_u16(sum, samples[0]);
+ sum = vadd_u16(sum, samples[1]);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples[1] = vshl_n_u16(samples[1], 1);
+ do {
+ sum = vadd_u16(sum, samples[1]);
+ y += 2;
+ } while (y < block_height);
+ }
+
+  // The left shift by 3 (to increase precision) cancels (log2 of width 4) + 1
+  // bits of the averaging right shift, leaving a shift of
+  // block_height_log2 - 1.
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x4_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1_s16(ssrc);
+ ssample = vshl_n_s16(ssample, 3);
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples;
+ int y = visible_height;
+
+ do {
+ samples = vld1q_u16(src);
+ src += src_stride;
+ sum = vpadalq_u16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = vpadalq_u16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is nullified in right
+ // shift (log2 of width 8).
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(sum), block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1q_s16(ssrc);
+ ssample = vshlq_n_s16(ssample, 3);
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1q_u16(src);
+ samples[1] =
+ (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+ : LastRowResult(samples[2]);
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ sum = vpadalq_u16(sum, inner_sum);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ do {
+ sum = vpadalq_u16(sum, inner_sum);
+ } while (++y < block_height);
+ }
+
+ // Here the left shift by 3 (to increase precision) is subtracted in right
+ // shift factor (block_width_log2 + block_height_log2 - 3).
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(sum), block_width_log2 + block_height_log2 - 3);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssamples_ext = vdupq_n_s16(0);
+ int16x8_t ssamples[4];
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ ssamples[idx] = vld1q_s16(&ssrc[x]);
+ ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+ ssamples_ext = ssamples[idx];
+ } else {
+ ssamples[idx] = LastRowResult(ssamples_ext);
+ }
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row0 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row1 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+ const uint16x8_t samples_row2 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row3 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+ uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const uint16x8_t samples_row4 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row5 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+ const uint16x8_t samples_row6 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row7 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+ sum =
+ vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ const uint16x4_t final_fill =
+ vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x4_t samples = vld1_s16(luma_ptr);
+ vst1_s16(luma_ptr, vsub_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const uint16x8_t samples_row10 = vld1q_u16(src);
+ const uint16x8_t samples_row11 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+ uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row20 = vld1q_u16(src);
+ const uint16x8_t samples_row21 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const uint16x8_t samples_row30 = vld1q_u16(src);
+ const uint16x8_t samples_row31 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+ const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row40 = vld1q_u16(src);
+ const uint16x8_t samples_row41 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const uint16x8_t samples_row50 = vld1q_u16(src);
+ const uint16x8_t samples_row51 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+ const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row60 = vld1q_u16(src);
+ const uint16x8_t samples_row61 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const uint16x8_t samples_row70 = vld1q_u16(src);
+ const uint16x8_t samples_row71 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+ const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to the end after max_luma_height.
+ const uint16x8_t final_fill =
+ vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum =
+ vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+ for (y = luma_height; y < block_height; ++y) {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ uint16x8_t final_fill0, final_fill1;
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width >= 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ const uint16x8_t samples_row02 = (max_luma_width >= 24)
+ ? vld1q_u16(src + 16)
+ : LastRowSamples(samples_row01);
+ const uint16x8_t samples_row03 = (max_luma_width == 32)
+ ? vld1q_u16(src + 24)
+ : LastRowSamples(samples_row02);
+ const uint16x8_t samples_row10 = vld1q_u16(src_next);
+ const uint16x8_t samples_row11 = (max_luma_width >= 16)
+ ? vld1q_u16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const uint16x8_t samples_row12 = (max_luma_width >= 24)
+ ? vld1q_u16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const uint16x8_t samples_row13 = (max_luma_width == 32)
+ ? vld1q_u16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+ const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+ const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+ final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+ final_sum = vpadalq_u16(final_sum, sum);
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const uint16x8_t wide_fill = LastRowResult(final_fill1);
+ final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ uint32x4_t wide_fill;
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row; shifting by 2 accounts for
+      // the widening to 32-bit. (a << 2) = (a + a) << 1.
+ wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+ }
+ const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+ const uint32x4_t final_fill_to_sum = vaddl_u16(
+ vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+ do {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+ vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+ if (block_width_log2 == 5) {
+ final_sum = vaddq_u32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_width_log2 + block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples0 = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+ const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+ const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+ vst1q_s16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const int16x8_t wide_fill = LastRowResult(final_row_result);
+ vst1q_s16(luma_ptr + 16, wide_fill);
+ vst1q_s16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to [0, (1 << bitdepth) - 1].
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+ const int16x8_t alpha_signed, const int16x8_t dc,
+ const uint16x8_t max_value) {
+ const int16x8_t luma_abs = vabsq_s16(luma);
+ const int16x8_t luma_alpha_sign =
+ vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+ // (alpha * luma) >> 6
+ const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+ // Convert back to signed values.
+ const int16x8_t la =
+ vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+ const int16x8_t result = vaddq_s16(la, dc);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
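Combine8() evaluates the CfL prediction Clip3(dc + ((alpha * luma) >> 6), 0, max) for eight samples. Because vqrdmulhq_s16 computes a rounded (2 * a * b) >> 16, preloading |alpha| << 9 makes the multiply-high equal a rounded (|alpha| * |luma|) >> 6; the sign is then restored with the XOR/subtract pair. A scalar sketch of the per-sample value, assuming 10-bit output (the name is illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdlib>

// Scalar sketch of one CfL-predicted sample: dc plus the rounded, scaled
// luma residual, clipped to the 10-bit range.
inline uint16_t CflPredictSample_Sketch(int16_t luma, int alpha, uint16_t dc) {
  const int magnitude = (std::abs(alpha * luma) + 32) >> 6;  // Rounded >> 6.
  const int scaled = (alpha * luma < 0) ? -magnitude : magnitude;
  return static_cast<uint16_t>(
      std::clamp(scaled + static_cast<int>(dc), 0, 1023));
}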
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+ const uint16x8_t sum =
+ Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+ vst1_u16(dst, vget_low_u16(sum));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(sum));
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint16x8_t sum =
+ Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_2 =
+ Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_3 =
+ Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ vst1q_u16(dst + 16, sum_2);
+ vst1q_u16(dst + 24, sum_3);
+ dst += dst_stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 5>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644
index 0000000..b4f983a
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers; see the
+// defines below for specifics. This function is not thread-safe.
+void IntraPredCflInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm> // std::min
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
@@ -35,14 +35,14 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
const uint16x8_t b_product = vmull_u8(b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
}
// For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
// 4 wide subsamples the output. 8 wide subsamples the input.
if (width == 4) {
const uint8x8_t left_values = vld1_u8(top + top_base_x);
- const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
// If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
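Both overloads compute the same blend: the two candidate edge samples are weighted by values that sum to 32 (the 5-bit fractional position and its complement) and the 32x-scaled result is rounded back down. A scalar sketch with illustrative naming:

#include <cstdint>

// Scalar sketch of the directional predictors' 32-weight blend:
// (a * a_weight + b * b_weight + 16) >> 5, with a_weight + b_weight == 32.
inline uint16_t WeightedBlend_Sketch(uint16_t a, uint16_t b, int a_weight,
                                     int b_weight) {
  return static_cast<uint16_t>((a * a_weight + b * b_weight + 16) >> 5);
}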
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+ const int height, const uint16_t* const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Replace samples at or beyond |max_base_x| with the final top value.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
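Per output row, the function advances a 6-bit fixed-point position along the top edge: top_x >> 6 selects the base sample, the fractional bits (halved to a 5-bit weight) drive the blend, and any position at or past |max_base_x| falls back to the last top sample, which is why rows beyond |min_corner_only_y| can be plain Memset calls. A scalar sketch of one non-upsampled output sample under those assumptions (the helper name is illustrative):

#include <cstdint>

// Scalar sketch of one Zone 1 sample at column |x| of row |y|, without
// upsampling. |xstep| is the 6-bit fixed-point step derived from the angle.
inline uint16_t Zone1Sample_Sketch(const uint16_t* top, int max_base_x, int x,
                                   int y, int xstep) {
  const int top_x = (y + 1) * xstep;
  const int base = (top_x >> 6) + x;
  if (base >= max_base_x) return top[max_base_x];
  const int shift = (top_x & 0x3F) >> 1;  // Weight applied to top[base + 1].
  return static_cast<uint16_t>(
      (top[base] * (32 - shift) + top[base + 1] * shift + 16) >> 5);
}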
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+  for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+// Processes a |width| that is a multiple of 8 by any |height|. Iterates
+// horizontally before vertically in the hopes of being a little more cache
+// friendly.
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
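+  // This inverts (top_x >> index_scale_bits) + base_step * width <=
+  // max_base_index to solve for the row counter, so the first loop below can
+  // skip the |max_base_mask| comparison entirely.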
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
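+    // |min_corner_only_x| rounds the count of in-range columns up to a
+    // multiple of 8 (capped at |width|) so the vector loop covers every lane
+    // that can still read below |max_base_index|; the rest of the row is
+    // corner-only.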
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+ uint16_t* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
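+//
+// Ignoring upsampling, each output pixel is effectively
+//   dest[y][x] = RightShiftWithRounding(
+//       left[base + y] * (32 - shift) + left[base + y + 1] * shift, 5)
+// with base = ((x + 1) * ystep) >> 6 and shift = (((x + 1) * ystep) & 0x3F) >> 1,
+// i.e. the same blend Zone 1 applies with (y + 1) * xstep along the top row.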
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+ const int height, const uint16_t* const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ ystep);
+ dest += 4 * stride;
+ y += 4;
+ } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const uint16_t* const left,
+ const int ystep) {
+ int x = 0;
+ int base_left_y = 0;
+ do {
+ // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+ // 8x4 and 16x4.
+ DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+ base_left_y);
+ base_left_y += 4 * ystep;
+ x += 4;
+ } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const left, const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+ dst_x += 8 * sizeof(uint16_t);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+ int y = height;
+    // |left_ptr| must be declared outside the loop so that the += 4 below
+    // advances the source for each group of four rows.
+    const uint16_t* left_ptr = left + 1;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644
index 0000000..f7d6235
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc
index 411708e..bd9f61d 100644
--- a/src/dsp/arm/intrapred_filter_intra_neon.cc
+++ b/src/dsp/arm/intrapred_filter_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The libgav1 Authors
+// Copyright 2021 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -160,16 +160,16 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
-void IntraPredFilterIntraInit_NEON() {}
+void IntraPredFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644
index 0000000..283c1b1
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -964,6 +965,200 @@ struct DcDefs {
using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
};
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -973,6 +1168,8 @@ void Init10bpp() {
DcDefs::_4x4::DcLeft;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@ void Init10bpp() {
DcDefs::_4x8::DcLeft;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@ void Init10bpp() {
DcDefs::_4x16::DcLeft;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@ void Init10bpp() {
DcDefs::_8x4::DcLeft;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@ void Init10bpp() {
DcDefs::_8x8::DcLeft;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@ void Init10bpp() {
DcDefs::_8x16::DcLeft;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@ void Init10bpp() {
DcDefs::_8x32::DcLeft;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@ void Init10bpp() {
DcDefs::_16x4::DcLeft;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@ void Init10bpp() {
DcDefs::_16x8::DcLeft;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@ void Init10bpp() {
DcDefs::_16x16::DcLeft;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@ void Init10bpp() {
DcDefs::_16x32::DcLeft;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@ void Init10bpp() {
DcDefs::_16x64::DcLeft;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@ void Init10bpp() {
DcDefs::_32x8::DcLeft;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@ void Init10bpp() {
DcDefs::_32x16::DcLeft;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@ void Init10bpp() {
DcDefs::_32x32::DcLeft;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@ void Init10bpp() {
DcDefs::_32x64::DcLeft;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@ void Init10bpp() {
DcDefs::_64x16::DcLeft;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@ void Init10bpp() {
DcDefs::_64x32::DcLeft;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@ void Init10bpp() {
DcDefs::_64x64::DcLeft;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
}
} // namespace
@@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h
index 16f858c..b27f29f 100644
--- a/src/dsp/arm/intrapred_neon.h
+++ b/src/dsp/arm/intrapred_neon.h
@@ -23,396 +23,282 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
-void IntraPredCflInit_NEON();
-void IntraPredDirectionalInit_NEON();
-void IntraPredFilterIntraInit_NEON();
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
void IntraPredInit_NEON();
-void IntraPredSmoothInit_NEON();
} // namespace dsp
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-// 8 bit
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
-
// 4x4
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 10 bit
// 4x4
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index abc93e8..c33f333 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..edd01be
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..ff184a1
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2543 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+ int32x4_t out[4]) {
+ // in:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // 00 10 02 12 a.val[0]
+ // 01 11 03 13 a.val[1]
+ // 20 30 22 32 b.val[0]
+ // 21 31 23 33 b.val[1]
+ const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+ const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
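+  // vtrnq_s32 yields the 2x2-transposed sub-blocks (laid out as commented
+  // above); the vextq_s32 pairs below splice their 64-bit halves into the
+  // fully transposed rows.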
+ out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+ out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+ out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+ out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+ // out:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+}
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+ const int32x4_t* const s) {
+ assert(store_count % 4 == 0);
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s32(&dst[i * stride + idx], s[i]);
+ vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+ vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+ vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
+ int32_t idx, int32x4_t* x) {
+ assert(load_count % 4 == 0);
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s32(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+ }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+ const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 values are 13
+  // bits, which leaves 1 bit of headroom for the add/subtract, so for 10bpp
+  // x/y fit in a 32-bit lane.
+ const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+ const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
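+  // Per lane: x = (a * cos128 - b * sin128 + 2048) >> 12 and
+  // y = (a * sin128 + b * cos128 + 2048) >> 12, a rounded Q12 rotation.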
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ assert(sin128 <= 0xfff);
+ const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+ const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+ const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip, const int32x4_t* min,
+ const int32x4_t* max) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = vmaxq_s32(vminq_s32(x, *max), *min);
+ *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
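+// The [min, max] overload clamps intermediate values to the working range
+// between stages; the earlier overload relies on the saturating
+// vqaddq_s32/vqsubq_s32 alone and is only used for a transform's last stage.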
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
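+  // vqrdmulhq_n_s32(v, k << 19) evaluates (2 * v * k * (1 << 19) + (1 << 31))
+  // >> 32 == (v * k + 2048) >> 12, i.e. a rounded Q12 multiply by
+  // kTransformRowMultiplier; the vbslq_s32 below applies it only when
+  // |should_round| is set.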
+ const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+ const int32_t cos128 = Cos128(32);
+ const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+ // vqrshlq_s32 will shift right if shift value is negative.
+ const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+ // Clamp result to signed 16 bits.
+ const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+ if (width == 4) {
+ vst1q_s32(dst, result);
+ } else {
+ for (int i = 0; i < width; i += 4) {
+ vst1q_s32(dst, result);
+ dst += 4;
+ }
+ }
+ return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32_t cos128 = Cos128(32);
+
+  // Calculate the dc values for the first row.
+ if (width == 4) {
+ const int32x4_t v_src = vld1q_s32(dst);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(&dst[i], xy);
+ i += 4;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+ } else {
+ HadamardRotation(&s[0], &s[3], false, min, max);
+ HadamardRotation(&s[1], &s[2], false, min, max);
+ }
+}
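+// The stage numbers in the Dct*Stages helpers and in Dct64_NEON share one
+// global numbering (the composed dct64 flow graph), which is why Dct4Stages
+// starts at "stage 12"; composing the helpers keeps the stages in order.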
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, set the range to the row range; otherwise, set it
+  // to the column range.
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[4], x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 4; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(s, s);
+ }
+ StoreDst<4>(dst, step, 0, s);
+}
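+// For the row pass, the 4x4 tile is transposed so that each of the four rows
+// occupies one lane; the 1-D transform then handles four rows in parallel,
+// and the results are rounded by |row_shift|, clamped to int16 range, and
+// transposed back before storing.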
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false, min, max);
+ HadamardRotation(&s[6], &s[7], true, min, max);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+ } else {
+ HadamardRotation(&s[0], &s[7], false, min, max);
+ HadamardRotation(&s[1], &s[6], false, min, max);
+ HadamardRotation(&s[2], &s[5], false, min, max);
+ HadamardRotation(&s[3], &s[4], false, min, max);
+ }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(&s[0], &s[0]);
+ Transpose4x4(&s[4], &s[4]);
+ StoreDst<4>(dst, step, 0, &s[0]);
+ StoreDst<4>(dst, step, 4, &s[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false, min, max);
+ HadamardRotation(&s[10], &s[11], true, min, max);
+ HadamardRotation(&s[12], &s[13], false, min, max);
+ HadamardRotation(&s[14], &s[15], true, min, max);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false, min, max);
+ HadamardRotation(&s[9], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[15], true, min, max);
+ HadamardRotation(&s[13], &s[14], true, min, max);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+ } else {
+ HadamardRotation(&s[0], &s[15], false, min, max);
+ HadamardRotation(&s[1], &s[14], false, min, max);
+ HadamardRotation(&s[2], &s[13], false, min, max);
+ HadamardRotation(&s[3], &s[12], false, min, max);
+ HadamardRotation(&s[4], &s[11], false, min, max);
+ HadamardRotation(&s[5], &s[10], false, min, max);
+ HadamardRotation(&s[6], &s[9], false, min, max);
+ HadamardRotation(&s[7], &s[8], false, min, max);
+ }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&s[idx], &s[idx]);
+ Transpose4x4(&s[idx + 4], &s[idx + 4]);
+ StoreDst<4>(dst, step, idx, &s[idx]);
+ StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false, min, max);
+ HadamardRotation(&s[18], &s[19], true, min, max);
+ HadamardRotation(&s[20], &s[21], false, min, max);
+ HadamardRotation(&s[22], &s[23], true, min, max);
+ HadamardRotation(&s[24], &s[25], false, min, max);
+ HadamardRotation(&s[26], &s[27], true, min, max);
+ HadamardRotation(&s[28], &s[29], false, min, max);
+ HadamardRotation(&s[30], &s[31], true, min, max);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false, min, max);
+ HadamardRotation(&s[17], &s[18], false, min, max);
+ HadamardRotation(&s[20], &s[23], true, min, max);
+ HadamardRotation(&s[21], &s[22], true, min, max);
+ HadamardRotation(&s[24], &s[27], false, min, max);
+ HadamardRotation(&s[25], &s[26], false, min, max);
+ HadamardRotation(&s[28], &s[31], true, min, max);
+ HadamardRotation(&s[29], &s[30], true, min, max);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false, min, max);
+ HadamardRotation(&s[17], &s[22], false, min, max);
+ HadamardRotation(&s[18], &s[21], false, min, max);
+ HadamardRotation(&s[19], &s[20], false, min, max);
+ HadamardRotation(&s[24], &s[31], true, min, max);
+ HadamardRotation(&s[25], &s[30], true, min, max);
+ HadamardRotation(&s[26], &s[29], true, min, max);
+ HadamardRotation(&s[27], &s[28], true, min, max);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+ } else {
+ HadamardRotation(&s[0], &s[31], false, min, max);
+ HadamardRotation(&s[1], &s[30], false, min, max);
+ HadamardRotation(&s[2], &s[29], false, min, max);
+ HadamardRotation(&s[3], &s[28], false, min, max);
+ HadamardRotation(&s[4], &s[27], false, min, max);
+ HadamardRotation(&s[5], &s[26], false, min, max);
+ HadamardRotation(&s[6], &s[25], false, min, max);
+ HadamardRotation(&s[7], &s[24], false, min, max);
+ HadamardRotation(&s[8], &s[23], false, min, max);
+ HadamardRotation(&s[9], &s[22], false, min, max);
+ HadamardRotation(&s[10], &s[21], false, min, max);
+ HadamardRotation(&s[11], &s[20], false, min, max);
+ HadamardRotation(&s[12], &s[19], false, min, max);
+ HadamardRotation(&s[13], &s[18], false, min, max);
+ HadamardRotation(&s[14], &s[17], false, min, max);
+ HadamardRotation(&s[15], &s[16], false, min, max);
+ }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<32>(dst, step, 0, &s[0]);
+ }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false, &min, &max);
+ HadamardRotation(&s[34], &s[35], true, &min, &max);
+ HadamardRotation(&s[36], &s[37], false, &min, &max);
+ HadamardRotation(&s[38], &s[39], true, &min, &max);
+ HadamardRotation(&s[40], &s[41], false, &min, &max);
+ HadamardRotation(&s[42], &s[43], true, &min, &max);
+ HadamardRotation(&s[44], &s[45], false, &min, &max);
+ HadamardRotation(&s[46], &s[47], true, &min, &max);
+ HadamardRotation(&s[48], &s[49], false, &min, &max);
+ HadamardRotation(&s[50], &s[51], true, &min, &max);
+ HadamardRotation(&s[52], &s[53], false, &min, &max);
+ HadamardRotation(&s[54], &s[55], true, &min, &max);
+ HadamardRotation(&s[56], &s[57], false, &min, &max);
+ HadamardRotation(&s[58], &s[59], true, &min, &max);
+ HadamardRotation(&s[60], &s[61], false, &min, &max);
+ HadamardRotation(&s[62], &s[63], true, &min, &max);
+
+ // stage 7.
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false, &min, &max);
+ HadamardRotation(&s[33], &s[34], false, &min, &max);
+ HadamardRotation(&s[36], &s[39], true, &min, &max);
+ HadamardRotation(&s[37], &s[38], true, &min, &max);
+ HadamardRotation(&s[40], &s[43], false, &min, &max);
+ HadamardRotation(&s[41], &s[42], false, &min, &max);
+ HadamardRotation(&s[44], &s[47], true, &min, &max);
+ HadamardRotation(&s[45], &s[46], true, &min, &max);
+ HadamardRotation(&s[48], &s[51], false, &min, &max);
+ HadamardRotation(&s[49], &s[50], false, &min, &max);
+ HadamardRotation(&s[52], &s[55], true, &min, &max);
+ HadamardRotation(&s[53], &s[54], true, &min, &max);
+ HadamardRotation(&s[56], &s[59], false, &min, &max);
+ HadamardRotation(&s[57], &s[58], false, &min, &max);
+ HadamardRotation(&s[60], &s[63], true, &min, &max);
+ HadamardRotation(&s[61], &s[62], true, &min, &max);
+
+ // stage 16.
+ ButterflyRotation_4(&s[61], &s[34], 56, true);
+ ButterflyRotation_4(&s[60], &s[35], 56, true);
+ ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false, &min, &max);
+ HadamardRotation(&s[33], &s[38], false, &min, &max);
+ HadamardRotation(&s[34], &s[37], false, &min, &max);
+ HadamardRotation(&s[35], &s[36], false, &min, &max);
+ HadamardRotation(&s[40], &s[47], true, &min, &max);
+ HadamardRotation(&s[41], &s[46], true, &min, &max);
+ HadamardRotation(&s[42], &s[45], true, &min, &max);
+ HadamardRotation(&s[43], &s[44], true, &min, &max);
+ HadamardRotation(&s[48], &s[55], false, &min, &max);
+ HadamardRotation(&s[49], &s[54], false, &min, &max);
+ HadamardRotation(&s[50], &s[53], false, &min, &max);
+ HadamardRotation(&s[51], &s[52], false, &min, &max);
+ HadamardRotation(&s[56], &s[63], true, &min, &max);
+ HadamardRotation(&s[57], &s[62], true, &min, &max);
+ HadamardRotation(&s[58], &s[61], true, &min, &max);
+ HadamardRotation(&s[59], &s[60], true, &min, &max);
+
+ // stage 25.
+ ButterflyRotation_4(&s[59], &s[36], 48, true);
+ ButterflyRotation_4(&s[58], &s[37], 48, true);
+ ButterflyRotation_4(&s[57], &s[38], 48, true);
+ ButterflyRotation_4(&s[56], &s[39], 48, true);
+ ButterflyRotation_4(&s[55], &s[40], 112, true);
+ ButterflyRotation_4(&s[54], &s[41], 112, true);
+ ButterflyRotation_4(&s[53], &s[42], 112, true);
+ ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false, &min, &max);
+ HadamardRotation(&s[33], &s[46], false, &min, &max);
+ HadamardRotation(&s[34], &s[45], false, &min, &max);
+ HadamardRotation(&s[35], &s[44], false, &min, &max);
+ HadamardRotation(&s[36], &s[43], false, &min, &max);
+ HadamardRotation(&s[37], &s[42], false, &min, &max);
+ HadamardRotation(&s[38], &s[41], false, &min, &max);
+ HadamardRotation(&s[39], &s[40], false, &min, &max);
+ HadamardRotation(&s[48], &s[63], true, &min, &max);
+ HadamardRotation(&s[49], &s[62], true, &min, &max);
+ HadamardRotation(&s[50], &s[61], true, &min, &max);
+ HadamardRotation(&s[51], &s[60], true, &min, &max);
+ HadamardRotation(&s[52], &s[59], true, &min, &max);
+ HadamardRotation(&s[53], &s[58], true, &min, &max);
+ HadamardRotation(&s[54], &s[57], true, &min, &max);
+ HadamardRotation(&s[55], &s[56], true, &min, &max);
+
+ // stage 30.
+ ButterflyRotation_4(&s[55], &s[40], 32, true);
+ ButterflyRotation_4(&s[54], &s[41], 32, true);
+ ButterflyRotation_4(&s[53], &s[42], 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 32, true);
+ ButterflyRotation_4(&s[50], &s[45], 32, true);
+ ButterflyRotation_4(&s[49], &s[46], 32, true);
+ ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+ }
+ //-- end dct 64 stages
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<64>(dst, step, 0, &s[0]);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+ int32x4_t x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+ s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+ const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+ // s[0] = s[0] + s[3]
+ s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
+
+ s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ x[0] = vrshrq_n_s32(x0, 12);
+ x[1] = vrshrq_n_s32(x1, 12);
+ x[2] = vrshrq_n_s32(s[2], 12);
+ x[3] = vrshrq_n_s32(x3, 12);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+ x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+ x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+ x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+ Transpose4x4(x, x);
+ }
+ StoreDst<4>(dst, step, 0, x);
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[2];
+
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src0_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+ const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
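+  // kAdst4DcOnlyMultiplier is kAdst4Multiplier[0], [1], [2], [1] laid out for
+  // a single vector load, producing the lane pattern noted below.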
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
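+  // Lane-wise x3 is {s0*k0, s0*k1, s0*k2, s0*(k0 + k1)}, matching the four
+  // Adst4 outputs (before the >> 12) when only the DC coefficient is nonzero.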
+ const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+ // vqrshlq_s32 will shift right if shift value is negative.
+ vst1q_s32(dst,
+ vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+ s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+ s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+ const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+ const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+ const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+ vst1q_s32(&dst[i], dst_0);
+ vst1q_s32(&dst[i + width * 1], dst_1);
+ vst1q_s32(&dst[i + width * 2], dst_2);
+ vst1q_s32(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ StoreDst<4>(dst, step, 0, &x[0]);
+ StoreDst<4>(dst, step, 4, &x[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false, &min, &max);
+ HadamardRotation(&s[1], &s[9], false, &min, &max);
+ HadamardRotation(&s[2], &s[10], false, &min, &max);
+ HadamardRotation(&s[3], &s[11], false, &min, &max);
+ HadamardRotation(&s[4], &s[12], false, &min, &max);
+ HadamardRotation(&s[5], &s[13], false, &min, &max);
+ HadamardRotation(&s[6], &s[14], false, &min, &max);
+ HadamardRotation(&s[7], &s[15], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[8], &s[12], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[9], &s[13], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[10], &s[14], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+ HadamardRotation(&s[11], &s[15], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[8], &s[10], false, &min, &max);
+ HadamardRotation(&s[12], &s[14], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+ HadamardRotation(&s[9], &s[11], false, &min, &max);
+ HadamardRotation(&s[13], &s[15], false, &min, &max);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ StoreDst<4>(dst, step, idx, &x[idx]);
+ StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &x[0]);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int i = 0;
+ do {
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
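+  // v_dual_round folds the rounding offsets of both shifts into one constant
+  // (1 << 11 for the >> 12 multiplier shift, plus 1 << 12 when shift == 1),
+  // so vqshlq_s32 below can use a plain, non-rounding right shift.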
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+ "Invalid identity_size.");
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
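+  // Each result is added to the existing frame value and clamped to the
+  // 10-bit range: vqmovun_s32 saturates the low end at zero and vmin_u16
+  // caps at (1 << kBitdepth10) - 1.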
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
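+ // The row shift is 0 on this combined row+column path (4x4 and 8x4
+ // identity-identity), so only the multiply rounding term is needed.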
+ const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+ const int32x4_t v_dst_row =
+ vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+ const int32x4_t v_dst_col =
+ vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src_round.val[0] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+ v_src_round.val[1] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+ v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+ v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+ v_dst_col.val[0] =
+ vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+ v_dst_col.val[1] =
+ vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+ a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height equal to 32 can be simplified from
+ // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
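+ // vrshrq_n_s32(x, 1) computes exactly (x + 1) >> 1, so no multiplier is
+ // needed at all here.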
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+ const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+ const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ int32x4x2_t v_src;
+ v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+ v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+ vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ const int32x4_t c = vld1q_s32(&source[i + 8]);
+ const int32x4_t d = vld1q_s32(&source[i + 12]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ const int32x4_t c_rev = vrev64q_s32(c);
+ const int32x4_t d_rev = vrev64q_s32(d);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+ vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int32x4_t a_lo = vld1q_s32(&source[i]);
+ const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
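+ // Scaling kTransformRowMultiplier by 1 << (31 - 12) makes
+ // vqrdmulhq_n_s32(), which returns (2 * a * b + (1 << 31)) >> 32, compute
+ // (a * kTransformRowMultiplier + (1 << 11)) >> 12.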
+ const int32x4_t b_lo =
+ vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t b_hi =
+ vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+ vst1q_s32(&source[i], b_lo);
+ vst1q_s32(&source[i + 4], b_hi);
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ row_shift = -row_shift;
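+ // For row_shift > 0 this is a rounded right shift:
+ // (x + (1 << (row_shift - 1))) >> row_shift.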
+
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int32x4_t residual0 = vld1q_s32(&source[i]);
+ const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+ vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+ vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int32_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int32x4_t residual = vld1q_s32(&source[row]);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int32x4_t residual = vld1q_s32(&source[row + j]);
+ const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+ const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+ const uint32x4_t b =
+ vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+ const uint32x4_t b_hi =
+ vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+ vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+ vdupq_n_u16((1 << kBitdepth10) - 1)));
+ j += 8;
+ } while (j < tx_width);
+ }
+ }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+ row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct8 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+ row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+ data += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct16 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct32 rows in parallel per iteration.
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+ data += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<32>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct64 rows in parallel per iteration.
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+ data += 128 * 2;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<64>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+ /*transpose=*/true, row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 4 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 4 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ const int shift = tx_height > 8 ? 1 : 0;
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON(src, /*step=*/4, shift);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from (((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed 16
+ // bit value.
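+ // The vqmovn_s32()/vmovl_s16() pair below performs that clamp to
+ // [INT16_MIN, INT16_MAX] while leaving in-range values unchanged.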
+ if ((tx_height & 0x18) != 0) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+ const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+ vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+ vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+ }
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+}
+
+} // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 072991a..315d5e9 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -3117,7 +3117,7 @@ void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h
index af647e8..91e0e83 100644
--- a/src/dsp/arm/inverse_transform_neon.h
+++ b/src/dsp/arm/inverse_transform_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
// Initializes Dsp::inverse_transforms, see the defines below for specifics.
// This function is not thread-safe.
void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
@@ -47,6 +48,21 @@ void InverseTransformInit_NEON();
#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 146c983..8d72892 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -35,7 +35,7 @@ namespace {
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
- return vorr_u8(a, RightShift<32>(a));
+ return vorr_u8(a, RightShiftVector<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
@@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
- const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
@@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
- InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
@@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
- return vand_u8(b, RightShift<32>(b));
+ return vand_u8(b, RightShiftVector<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
@@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
- return vand_u8(c, RightShift<32>(c));
+ return vand_u8(c, RightShiftVector<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
@@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index 337c9b4..e6ceb66 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -41,10 +41,25 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) {
}
template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
inline uint16x8_t VshrU128(const uint16x8x2_t src) {
return vextq_u16(src.val[0], src.val[1], bytes / 2);
}
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
+
// Wiener
// Must make a local copy of coefficients to help compiler know that they have
@@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
int16_t** const wiener_buffer) {
for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
- uint8x16_t s[4];
- s[0] = vld1q_u8(src_ptr);
+ uint8x16_t s[3];
ptrdiff_t x = width;
do {
- src_ptr += 16;
- s[3] = vld1q_u8(src_ptr);
- s[1] = vextq_u8(s[0], s[3], 1);
- s[2] = vextq_u8(s[0], s[3], 2);
+ // Slightly faster than using vextq_u8().
+ s[0] = vld1q_u8(src_ptr);
+ s[1] = vld1q_u8(src_ptr + 1);
+ s[2] = vld1q_u8(src_ptr + 2);
int16x8x2_t sum;
sum.val[0] = sum.val[1] = vdupq_n_s16(0);
WienerHorizontalSum(s, filter, sum, *wiener_buffer);
- s[0] = s[3];
+ src_ptr += 16;
*wiener_buffer += 16;
x -= 16;
} while (x != 0);
@@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
// For width 16 and up, store the horizontal results, and then do the vertical
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
-void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
const auto* const top = static_cast<const uint8_t*>(top_border);
const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, filter_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, filter_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
//------------------------------------------------------------------------------
// SGR
-inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
}
-inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
uint16x4_t high[3]) {
uint16x8_t s[3];
s[0] = VshrU128<0>(src);
@@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
high[2] = vget_high_u16(s[2]);
}
-inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
@@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
dst[4] = VshrU128<4>(src);
}
-inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
uint16x4_t high[5]) {
Prepare3_16(src, low, high);
const uint16x8_t s3 = VshrU128<6>(src);
@@ -641,6 +674,30 @@ inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) {
return vaddw_u8(sum, src[2]);
}
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
const uint32x4_t sum = vaddl_u16(src[0], src[1]);
return vaddw_u16(sum, src[2]);
@@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) {
return vaddw_u16(sum0123, src[4]);
}
-inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[3];
Prepare3_8(src, s);
return Sum3W_16(s);
}
-inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+ uint8x16_t s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t sum;
Prepare3_16(src, low, high);
@@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
return sum;
}
-inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
@@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
return vaddw_u8(sum0123, s[4]);
}
-inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+ uint16x8_t* const dst1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[5], high[5];
Prepare5_16(src, low, high);
uint32x4x2_t sum;
@@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
return sum;
}
-void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
- uint32x4_t* const row_sq5) {
- const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
- const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
- *row_sq3 = vaddw_u16(sum12, src[3]);
- *row_sq5 = vaddq_u32(sum04, *row_sq3);
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+ const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+ *row5_1 = vaddq_u16(sum04_hi, *row3_1);
}
-void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
- uint16x8_t* const row3, uint16x8_t* const row5,
- uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
*row3 = vaddw_u8(sum12, s[3]);
*row5 = vaddq_u16(sum04, *row3);
+}
+
+void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
+ uint32x4_t* const row_sq5) {
+ const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
+ const uint32x4_t sum12 = vaddl_u16(src[1], src[2]);
+ *row_sq3 = vaddw_u16(sum12, src[3]);
+ *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+ uint32x4x2_t* const row_sq5) {
uint16x4_t low[5], high[5];
Prepare5_16(sq, low, high);
SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
}
-inline uint16x8_t Sum343(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ SumHorizontal(src, row3, row5);
+ SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
- return vaddw_u8(sum3, s[1]);
+ return vaddw_u8(sum3,
+ (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
}
inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
@@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
return vaddw_u16(sum3, src[1]);
}
-inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
return d;
}
-inline uint16x8_t Sum565(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
const uint16x8_t sum5 = vaddq_u16(sum4, sum);
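+ // sum4 + sum = 5 * sum; vaddw_u8() below adds the center tap once more,
+ // giving the 5-6-5 weighting the function is named for.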
- return vaddw_u8(sum5, s[1]);
+ return vaddw_u8(sum5,
+ (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
}
inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
@@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
return vaddw_u16(sum5, src[1]);
}
-inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
}
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
- uint16_t* sum5, uint32_t* square_sum3,
- uint32_t* square_sum5) {
- int y = height;
+ const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row3, row5;
uint32x4x2_t row_sq3, row_sq5;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
vst1q_u16(sum3, row3);
vst1q_u16(sum5, row5);
@@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sum3 += 8;
sum5 += 8;
square_sum3 += 8;
@@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+ const ptrdiff_t sum_stride, uint16_t* sums,
uint32_t* square_sums) {
static_assert(size == 3 || size == 5, "");
- int y = height;
+ int y = 2;
+ // Don't change loop width to 16, which is even slower.
do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
ptrdiff_t x = 0;
do {
uint16x8_t row;
uint32x4x2_t row_sq;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
if (size == 3) {
row = Sum3Horizontal(s);
row_sq = Sum3WHorizontal(sq);
@@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
vst1q_u16(sums, row);
vst1q_u32(square_sums + 0, row_sq.val[0]);
vst1q_u32(square_sums + 4, row_sq.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ s[0] = s[1];
+ sq[0] = sq[1];
sums += 8;
square_sums += 8;
x += 8;
@@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
return vmovn_u32(shifted);
}
-template <int n>
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
+
+template <int n, int offset>
inline void CalculateIntermediate(const uint16x8_t sum,
const uint32x4x2_t sum_sq,
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
constexpr uint32_t one_over_n =
((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
@@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
const uint16x4_t z1 =
CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
const uint16x8_t z01 = vcombine_u16(z0, z1);
- // Using vqmovn_u16() needs an extra sign extension instruction.
- const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
- // Using vgetq_lane_s16() can save the sign extension instruction.
- const uint8_t lookup[8] = {
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
- *ma = vld1_u8(lookup);
+ const uint8x8_t idx = vqmovn_u16(z01);
+  // Use a table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  // All elements whose indices fall outside the range [0, 47] are set to 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+  // Subtract 32 to shuffle the next index range.
+ const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47].
+  // Use an OR instruction to combine the shuffle results.
+ val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, the values change only
+  // rarely as the index increases, so comparison and arithmetic operations
+  // are used to calculate them.
+  // Elements whose indices are larger than 47 (still 0 after the lookup) are
+  // first set to 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+ *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+ : vcombine_u8(vget_low_u8(*ma), val);
+
// b = ma * b * one_over_n
// |ma| = [0, 255]
// |sum| is a box sum with radius 1 or 2.
@@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum,
// |kSgrProjReciprocalBits| is 12.
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
- const uint16x8_t maq = vmovl_u8(*ma);
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
@@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum,
*b = vcombine_u16(b_lo, b_hi);
}
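
Taken together, the table lookup plus the vmax/AdjustValue chain reproduces the full 256-entry ma curve: indices 0..47 come straight from kSgrMaLookup, and everything above follows the breakpoints named in the comments. A scalar model of the tail, assuming those breakpoints are exhaustive (the helper name is illustrative):

// Values for indices 48..255; indices 0..47 are read from kSgrMaLookup.
int SgrMaTailValue(int idx) {
  int v = 5;           // 48..55
  if (idx > 55) --v;   // 56..72   -> 4
  if (idx > 72) --v;   // 73..101  -> 3
  if (idx > 101) --v;  // 102..169 -> 2
  if (idx > 169) --v;  // 170..254 -> 1
  if (idx > 254) --v;  // 255      -> 0
  return v;
}

The worst-case products quoted in the range comment check out: 255 * 6375 * 164 >> 12 = 65088 and 255 * 2295 * 455 >> 12 = 65009, both within 16 bits.
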
+template <int offset>
inline void CalculateIntermediate5(const uint16x8_t s5[5],
const uint32x4x2_t sq5[5],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum5_16(s5);
const uint32x4x2_t sum_sq = Sum5_32(sq5);
- CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
}
+template <int offset>
inline void CalculateIntermediate3(const uint16x8_t s3[3],
const uint32x4x2_t sq3[3],
- const uint32_t scale, uint8x8_t* const ma,
+ const uint32_t scale, uint8x16_t* const ma,
uint16x8_t* const b) {
const uint16x8_t sum = Sum3_16(s3);
const uint32x4x2_t sum_sq = Sum3_32(sq3);
- CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
+ CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint16x8_t* const sum_ma444,
uint32x4x2_t* const sum_b343,
uint32x4x2_t* const sum_b444, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
- uint8x8_t s[3];
- Prepare3_8(ma3, s);
- const uint16x8_t sum_ma111 = Sum3W_16(s);
+ const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
*sum_ma444 = vshlq_n_u16(sum_ma111, 2);
const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
- *sum_ma343 = vaddw_u8(sum333, s[1]);
+ *sum_ma343 = vaddw_u8(
+ sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
uint16x4_t low[3], high[3];
uint32x4x2_t sum_b111;
Prepare3_16(b3, low, high);
@@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
vst1q_u32(b444 + x + 4, sum_b444->val[1]);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint32x4x2_t* const sum_b343, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma444;
uint32x4x2_t sum_b444;
- Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
- ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+ ma343, ma444, b343, b444);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma343;
uint32x4x2_t sum_b343;
- Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+ b444);
}
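
The 4-4-4 and 3-4-3 row weights both derive from the plain 1-1-1 sum: with sum_ma111 = a + b + c, shifting left by 2 gives the 444 row, and subtracting sum_ma111 then adding the middle column gives 3a + 4b + 3c for the 343 row, which is the sequence Store343_444() performs. A scalar sketch under those definitions, with illustrative names:

#include <cstdint>

uint16_t Weighted444(uint16_t a, uint16_t b, uint16_t c) {
  return static_cast<uint16_t>((a + b + c) << 2);    // 4a + 4b + 4c
}

uint16_t Weighted343(uint16_t a, uint16_t b, uint16_t c) {
  const uint16_t ma111 = a + b + c;
  const uint16_t ma444 = ma111 << 2;                 // vshlq_n_u16(.., 2)
  return static_cast<uint16_t>(ma444 - ma111 + b);   // 3a + 4b + 3c
}
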
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
- const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint32_t scale, uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2],
- uint8x8_t* const ma, uint16x8_t* const b) {
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
+ uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
uint16x8_t s5[5];
uint32x4x2_t sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- s5[3] = Sum5Horizontal(s[0]);
- s5[4] = Sum5Horizontal(s[1]);
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ s5[3] = Sum5Horizontal(s[0][0]);
+ s5[4] = Sum5Horizontal(s[1][0]);
sq5[3] = Sum5WHorizontal(sq[0]);
sq5[4] = Sum5WHorizontal(sq[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 1);
+ sq5[4] = Sum5WHorizontal(sq[1] + 1);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq5[3] = Sum5WHorizontal(sq[0] + 2);
+ sq5[4] = Sum5WHorizontal(sq[1] + 2);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s5[5];
+ uint8x16_t s[2], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
uint32x4x2_t sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s5[3] = s5[4] = Sum5Horizontal(*s);
- sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma,
- uint16x8_t* const b) {
- uint16x8_t s3[3];
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s3[4];
uint32x4x2_t sq3[3];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s3[2] = Sum3Horizontal(*s);
- sq3[2] = Sum3WHorizontal(*sq);
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum3Horizontal<8>(s, s3 + 2);
+ sq3[2] = Sum3WHorizontal(sq);
vst1q_u16(sum3[2] + x, s3[2]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
@@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scale, ma, b);
+ CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq3[2] = Sum3WHorizontal(sq + 1);
+ vst1q_u16(sum3[2] + x + 8, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ s3[1] = vld1q_u16(sum3[0] + x + 8);
+ s3[2] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+ CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
- const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
- uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0,
- uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1,
- uint8x8_t* const ma5, uint16x8_t* const b5) {
- uint16x8_t s3[4], s5[5];
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][4], s5[2][5];
uint32x4x2_t sq3[4], sq5[5];
- s[0].val[1] = vld1_u8(src0 + x + 8);
- s[1].val[1] = vld1_u8(src1 + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
- SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
- vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u16(sum3[3] + x, s3[3]);
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[0][2]);
+ vst1q_u16(sum3[3] + x, s3[0][3]);
vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
- CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+ CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+ &b3[1][1]);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+ CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+ &b3[1][2]);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const uint8_t* const src, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
+ uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
const uint16_t* const sum3[4], const uint16_t* const sum5[5],
const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
- uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
- uint16x8_t s3[3], s5[5];
+ uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
+ uint16x8_t b3[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][3], s5[2][5];
uint32x4x2_t sq3[3], sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- s5[4] = s5[3];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq, &sq3[2], &sq5[3]);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
@@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
sq5[4] = sq5[3];
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
}
inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
@@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
uint16_t* const sum5[5],
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
- const uint16x8_t ma = Sum565(mas);
- const uint32x4x2_t b = Sum565W(bs);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[0] = Sum565<0>(masx);
+ b[0] = Sum565W(bs);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
const uint8_t* const src, const int width, const uint32_t scale,
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
+ uint8x16_t ma3x[3];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
if (calculate444) {
- Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
- ma444 += 8;
- b444 += 8;
+ Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+ b444 + 8);
+ ma444 += 16;
+ b444 += 16;
} else {
- const uint16x8_t ma = Sum343(mas);
- const uint32x4x2_t b = Sum343W(bs);
- vst1q_u16(ma343, ma);
- vst1q_u32(b343 + 0, b.val[0]);
- vst1q_u32(b343 + 4, b.val[1]);
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[0] = Sum343<0>(ma3x);
+ b[0] = Sum343W(bs);
+ vst1q_u16(ma343, ma[0]);
+ vst1q_u32(b343 + 0, b[0].val[0]);
+ vst1q_u32(b343 + 4, b[0].val[1]);
+ ma[1] = Sum343<8>(ma3x);
+ b[1] = Sum343W(bs + 1);
+ vst1q_u16(ma343 + 8, ma[1]);
+ vst1q_u32(b343 + 8, b[1].val[0]);
+ vst1q_u32(b343 + 12, b[1].val[1]);
}
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma343 += 8;
- b343 += 8;
- x += 8;
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
} while (x < width);
}
@@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
- uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
- uint8x8x2_t s[2];
- uint8x8x2_t ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
- uint16x8_t ma = Sum343(ma3[0]);
- uint32x4x2_t b = Sum343W(b3[0]);
- vst1q_u16(ma343[0] + x, ma);
- vst1q_u32(b343[0] + x, b.val[0]);
- vst1q_u32(b343[0] + x + 4, b.val[1]);
- Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- ma = Sum565(ma5);
- b = Sum565W(b5);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
+ uint16x8_t ma[2];
+ uint8x16_t ma3x[3], ma5x[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343<0>(ma3x);
+ ma[1] = Sum343<8>(ma3x);
+ b[0] = Sum343W(b3[0] + 0);
+ b[1] = Sum343W(b3[0] + 1);
+ vst1q_u16(ma343[0] + x, ma[0]);
+ vst1q_u16(ma343[0] + x + 8, ma[1]);
+ vst1q_u32(b343[0] + x, b[0].val[0]);
+ vst1q_u32(b343[0] + x + 4, b[0].val[1]);
+ vst1q_u32(b343[0] + x + 8, b[1].val[0]);
+ vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565<0>(ma5x);
+ ma[1] = Sum565<8>(ma5x);
+ b[0] = Sum565W(b5);
+ b[1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ ma5[0] = ma5[1];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
}
-inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
const int16x4_t v_lo =
vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x4_t v_hi =
vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x8_t vv = vcombine_s16(v_lo, v_hi);
- const int16x8_t s = ZeroExtend(src);
- const int16x8_t d = vaddq_s16(s, vv);
- vst1_u8(dst, vqmovun_s16(d));
+ const int16x8_t d =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+ return vqmovun_s16(d);
}
-inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
- const int16x8_t filter[2], const int w0,
- const int w2, uint8_t* const dst) {
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
- const int16x8_t filter, const int w0,
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter,
+ const int w0) {
// weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter), w0);
v[1] = vmull_n_s16(vget_high_s16(filter), w0);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
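
Both multiplier variants end in the same combine: the weighted filter output is rounded down by kSgrProjRestoreBits + kSgrProjPrecisionBits, added to the source pixel, and saturated to 8 bits. A scalar sketch of one pixel, assuming the constants defined earlier in this file and illustrative parameter names (pass w2 = 0 for the single-multiplier path):

#include <algorithm>
#include <cstdint>

uint8_t SelfGuidedPixel(uint8_t src, int16_t p0, int16_t p1, int w0, int w2) {
  const int shift = kSgrProjRestoreBits + kSgrProjPrecisionBits;
  const int32_t v = w0 * p0 + w2 * p1;                          // vmull/vmlal_n_s16
  const int32_t filtered = (v + (1 << (shift - 1))) >> shift;   // vrshrn_n_s32
  const int32_t d = src + filtered;                             // vaddw_u8
  return static_cast<uint8_t>(
      std::min<int32_t>(255, std::max<int32_t>(0, d)));         // vqmovun_s16
}
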
LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
@@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
uint32_t* const square_sum5[5], const int width, const uint32_t scale,
const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
uint8_t* const dst) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ int16x8_t p0, p1;
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
vst1q_u16(ma565[1] + x, ma[1]);
vst1q_u32(b565[1] + x + 0, b[1].val[0]);
vst1q_u32(b565[1] + x + 4, b[1].val[1]);
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
- int16x8_t p0, p1;
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0] = vld1q_u16(ma565[0] + x);
b[0].val[0] = vld1q_u32(b565[0] + x + 0);
b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p0 = CalculateFilteredOutputPass1(sr0, ma, b);
- p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]);
- SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x);
- SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+ const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+ const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+ const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+ const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565,
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[4];
+ BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
+ sq + 1, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
ma[0] = vld1q_u16(ma565);
b[0].val[0] = vld1q_u32(b565 + 0);
b[0].val[1] = vld1q_u32(b565 + 4);
- const uint8x8_t sr = vld1_u8(src + x);
- const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- ma565 += 8;
- b565 += 8;
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ bs[0] = bs[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + 8);
+ b[0].val[0] = vld1q_u32(b565 + 8);
+ b[0].val[1] = vld1q_u32(b565 + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
@@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
uint32_t* const square_sum3[3], uint16_t* const ma343[3],
uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
- &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[3];
+ uint8x16_t ma3x[3];
uint32x4x2_t b[3];
- Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
- b444[1]);
- const uint8x8_t sr = vld1_u8(src + x);
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
ma[0] = vld1q_u16(ma343[0] + x);
ma[1] = vld1q_u16(ma444[0] + x);
b[0].val[0] = vld1q_u32(b343[0] + x + 0);
b[0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1].val[0] = vld1q_u32(b444[0] + x + 0);
b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedSingleMultiplier(sr, p, w0, dst + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
@@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma343[4], uint16_t* const ma444[3],
uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s[2], ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src0);
- s[1].val[0] = vld1_u8(src1);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
- &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
uint16x8_t ma[3][3];
+ uint8x16_t ma3x[2][3], ma5x[3];
uint32x4x2_t b[3][3];
- Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
- ma343[2], ma444[1], b343[2], b444[1]);
- Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
- b343[3], b444[2]);
- ma[0][1] = Sum565(ma5);
+ int16x8_t p[2][2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0][1] = Sum565<0>(ma5x);
b[0][1] = Sum565W(b5);
vst1q_u16(ma565[1] + x, ma[0][1]);
vst1q_u32(b565[1] + x, b[0][1].val[0]);
vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- int16x8_t p[2][2];
- const uint8x8_t sr0 = vld1_u8(src + x);
- const uint8x8_t sr1 = vld1_u8(src + stride + x);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0][0] = vld1q_u16(ma565[0] + x);
b[0][0].val[0] = vld1q_u32(b565[0] + x);
b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]);
- p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]);
+ p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
ma[1][0] = vld1q_u16(ma343[0] + x);
ma[1][1] = vld1q_u16(ma444[0] + x);
b[1][0].val[0] = vld1q_u32(b343[0] + x);
b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1][1].val[0] = vld1q_u32(b444[0] + x);
b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
ma[2][0] = vld1q_u16(ma343[1] + x);
b[2][0].val[0] = vld1q_u32(b343[1] + x);
b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
- p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]);
- SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x);
- SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x);
- x += 8;
+ p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+ const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+ const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+ Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+ &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565<8>(ma5x);
+ b[0][1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+ p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+ p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+ const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+ const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
@@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow(
const uint16_t scales[2], const int16_t w0, const int16_t w2,
uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s, ma3, ma5;
- uint16x8x2_t sq, b3, b5;
- uint16x8_t ma[3];
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], ma3[2], ma5[2];
+ uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- s.val[0] = vld1_u8(src0);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
- &b3.val[0], &b5.val[0]);
+ BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
- &b3.val[1], &b5.val[1]);
- ma[1] = Sum565(ma5);
+ square_sum5, s, sq + 1, ma3, ma5, &b3[1],
+ &b5[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565<0>(ma5x);
b[1] = Sum565W(b5);
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma[2] = Sum343(ma3);
+ Prepare3_8<0>(ma3, ma3x);
+ ma[2] = Sum343<0>(ma3x);
b[2] = Sum343W(b3);
- ma3.val[0] = ma3.val[1];
- b3.val[0] = b3.val[1];
- const uint8x8_t sr = vld1_u8(src + x);
- int16x8_t p[2];
- ma[0] = vld1q_u16(ma565[0] + x);
- b[0].val[0] = vld1q_u32(b565[0] + x + 0);
- b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0] = CalculateFilteredOutputPass1(sr, ma, b);
- ma[0] = vld1q_u16(ma343[0] + x);
- ma[1] = vld1q_u16(ma444[0] + x);
- b[0].val[0] = vld1q_u32(b343[0] + x + 0);
- b[0].val[1] = vld1q_u32(b343[0] + x + 4);
- b[1].val[0] = vld1q_u32(b444[0] + x + 0);
- b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[1] = CalculateFilteredOutputPass2(sr, ma, b);
- SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x);
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x);
+ b[0].val[0] = vld1q_u32(b565 + x + 0);
+ b[0].val[1] = vld1q_u32(b565 + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ b[0].val[0] = vld1q_u32(b343 + x + 0);
+ b[0].val[1] = vld1q_u32(b343 + x + 4);
+ b[1].val[0] = vld1q_u32(b444 + x + 0);
+ b[1].val[1] = vld1q_u32(b444 + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+ ma[1] = Sum565<8>(ma5x);
+ b[1] = Sum565W(b5 + 1);
+ b5[0] = b5[2];
+ ma[2] = Sum343<8>(ma3x);
+ b[2] = Sum343W(b3 + 1);
+ b3[0] = b3[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ b[0].val[0] = vld1q_u32(b565 + x + 8);
+ b[0].val[1] = vld1q_u32(b565 + x + 12);
+ p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ b[0].val[0] = vld1q_u32(b343 + x + 8);
+ b[0].val[1] = vld1q_u32(b343 + x + 12);
+ b[1].val[0] = vld1q_u32(b444 + x + 8);
+ b[1].val[1] = vld1q_u32(b444 + x + 12);
+ p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
@@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0],
- square_sum5[1]);
+ BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
+ square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, ma343, ma444, ma565[0], b343, b444,
+ square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
@@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2,
- sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565,
- b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+ ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+ dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
@@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0,
- sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
nullptr, b343[0], nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
@@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
ma444[0], b343[1], b444[0]);
@@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
// part of the visible frame.
void SelfGuidedFilter_NEON(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
@@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
@@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
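
For reference, the widened loops in this file (BoxFilterLastRow above, for example) now consume 16 source pixels per iteration: one 16-byte load is split into low and high 8-pixel halves, each half runs through the pass-1/pass-2 filtering, and the filtered halves are recombined for a single 16-byte store, which is also why the temporary strides are now aligned to 16. A minimal sketch of that pattern, with a placeholder FilterHalf() standing in for the real filtering:

#include <arm_neon.h>
#include <cstdint>

// Placeholder for the per-half pass-1/pass-2 filtering; identity here so the
// sketch is self-contained. The real kernels blend box sums into each half.
inline uint8x8_t FilterHalf(const uint8x8_t half) { return half; }

// Illustrative only (not part of the patch): 16 pixels per iteration, handled
// as two 8-pixel halves and recombined for a single 16-byte store.
inline void ProcessRow16(const uint8_t* src, uint8_t* dst, const int width) {
  int x = 0;
  do {
    const uint8x16_t s = vld1q_u8(src + x);
    const uint8x8_t d0 = FilterHalf(vget_low_u8(s));
    const uint8x8_t d1 = FilterHalf(vget_high_u8(s));
    vst1q_u8(dst + x, vcombine_u8(d0, d1));
    x += 16;
  } while (x < width);
}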
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index 084f42f..ee50923 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
index 8caba7d..3e731b2 100644
--- a/src/dsp/arm/motion_field_projection_neon.cc
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
index 8a403a6..da3ba17 100644
--- a/src/dsp/arm/motion_vector_search_neon.cc
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 66ad663..1111a90 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
index 1680450..91537c4 100644
--- a/src/dsp/arm/super_res_neon.cc
+++ b/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
@@ -20,6 +19,7 @@
#include <arm_neon.h>
+#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
@@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
}
void SuperRes_NEON(const void* const coefficients, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest) {
+ void* const dest, const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
int x = RightShiftWithCeiling(upscaled_width, 4);
// The below code calculates up to 15 extra upscaled
// pixels which will over-read up to 15 downscaled pixels at the end of each
- // row. kSuperResHorizontalBorder accounts for this.
+ // row. kSuperResHorizontalPadding accounts for this.
do {
for (int i = 0; i < 8; ++i, subpixel_x += step) {
sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
@@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
dst_ptr += 16;
} while (--x != 0);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
@@ -149,12 +149,147 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
+// subtracting all negative with saturation will clip to zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ // The below code calculates up to 7 extra upscaled
+ // pixels which will over-read up to 7 downscaled pixels at the end of each
+ // row. kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
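
For reference, the new high-bitdepth SuperRes() above accumulates the positive-tap and negative-tap products separately and combines them with a saturating subtraction, so the intermediate sum is clipped at zero before the rounding shift and the final clip to (1 << bitdepth) - 1. A scalar sketch of that idea (illustrative only, not part of the patch; the tap signs and shift mirror the NEON code):

#include <algorithm>
#include <cstdint>
#include <initializer_list>

// Scalar model of the saturating accumulation in the NEON SuperRes() above:
// taps 1, 3, 4, 6 are positive and taps 0, 2, 5, 7 are negative.
uint16_t SuperResPixelScalar(const uint16_t src[8], const uint16_t filter[8],
                             const int bitdepth, const int filter_bits) {
  uint32_t positive = 0;
  uint32_t negative = 0;
  for (const int i : {1, 3, 4, 6}) {
    positive += static_cast<uint32_t>(src[i]) * filter[i];
  }
  for (const int i : {0, 2, 5, 7}) {
    negative += static_cast<uint32_t>(src[i]) * filter[i];
  }
  // Saturating subtraction (vqsubq_u32 in the NEON code) clips the sum at 0.
  const uint32_t sum = (positive > negative) ? positive - negative : 0;
  // Rounding shift, then clip to the maximum pixel value.
  const uint32_t rounded = (sum + (1u << (filter_bits - 1))) >> filter_bits;
  const uint32_t max_pixel = (1u << bitdepth) - 1;
  return static_cast<uint16_t>(std::min(rounded, max_pixel));
}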
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
index f51785d..65e48c5 100644
--- a/src/dsp/arm/super_res_neon.h
+++ b/src/dsp/arm/super_res_neon.h
@@ -31,7 +31,10 @@ void SuperResInit_NEON();
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index 7a41998..c7fb739 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
const int16x8_t sum = vld1q_s16(tmp);
vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
}
-#else // !defined(__aarch64__)
+#else // !defined(__aarch64__)
int16x8_t filter[8];
for (int x = 0; x < 8; ++x) {
const int offset =
@@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc
index 49d3be0..7e5bff0 100644
--- a/src/dsp/arm/weight_mask_neon.cc
+++ b/src/dsp/arm/weight_mask_neon.cc
@@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
index a59abb0..d3ec21f 100644
--- a/src/dsp/average_blend.cc
+++ b/src/dsp/average_blend.cc
@@ -76,9 +76,7 @@ void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-#ifndef LIBGAV1_Dsp10bpp_AverageBlend
dsp->average_blend = AverageBlend_C<10, uint16_t>;
-#endif
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
static_cast<void>(dsp);
#ifndef LIBGAV1_Dsp10bpp_AverageBlend
diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc
new file mode 100644
index 0000000..fe8a9d6
--- /dev/null
+++ b/src/dsp/average_blend_test.cc
@@ -0,0 +1,322 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e8;
+constexpr char kAverageBlend[] = "AverageBlend";
+// average_blend is applied to compound prediction values. This implies a range
+// far exceeding that of pixel values.
+// The ranges include kCompoundOffset in 10bpp and 12bpp.
+// see: src/dsp/convolve.cc & src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
+
+struct TestParam {
+ TestParam(int width, int height) : width(width), height(height) {}
+ int width;
+ int height;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+template <int bitdepth, typename Pixel>
+class AverageBlendTest : public testing::TestWithParam<TestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ AverageBlendTest() = default;
+ ~AverageBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ AverageBlendInit_C();
+ DistanceWeightedBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_func_ = dsp->average_blend;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ AverageBlendInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ AverageBlendInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->average_blend;
+ dist_blend_func_ = dsp->distance_weighted_blend;
+ }
+
+ protected:
+ void Test(const char* digest, int num_tests, bool debug);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+ {};
+ dsp::AverageBlendFunc base_func_;
+ dsp::AverageBlendFunc func_;
+ dsp::DistanceWeightedBlendFunc dist_blend_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests,
+ bool debug) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ PredType* src_1 = source1_;
+ PredType* src_2 = source2_;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ src_1 += width_;
+ src_2 += width_;
+ }
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_tests; ++i) {
+ const absl::Time start = absl::Now();
+ func_(source1_, source2_, width_, height_, dest_,
+ sizeof(dest_[0]) * kDestStride);
+ elapsed_time += absl::Now() - start;
+ }
+ if (debug) {
+ if (base_func_ != nullptr) {
+ base_func_(source1_, source2_, width_, height_, reference_,
+ sizeof(reference_[0]) * kDestStride);
+ } else {
+ // Use dist_blend_func_ as the base for C tests.
+ const int8_t weight = 8;
+ dist_blend_func_(source1_, source2_, weight, weight, width_, height_,
+ reference_, sizeof(reference_[0]) * kDestStride);
+ }
+ EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_,
+ kDestStride, kDestStride, false));
+ }
+
+ test_utils::CheckMd5Digest(
+ kAverageBlend, absl::StrFormat("%dx%d", width_, height_).c_str(), digest,
+ dest_, sizeof(dest_[0]) * kDestStride * height_, elapsed_time);
+}
+
+const TestParam kTestParam[] = {
+ TestParam(4, 4), TestParam(4, 8), TestParam(8, 8),
+ TestParam(8, 16), TestParam(16, 8), TestParam(16, 16),
+ TestParam(16, 32), TestParam(32, 16), TestParam(32, 32),
+ TestParam(32, 64), TestParam(64, 32), TestParam(64, 64),
+ TestParam(64, 128), TestParam(128, 64), TestParam(128, 128),
+};
+
+using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>;
+
+const char* GetAverageBlendDigest8bpp(const TestParam block_size) {
+ static const char* const kDigestsWidth4[] = {
+ "152bcc35946900b1ed16369b3e7a81b7",
+ "c23e9b5698f7384eaae30a3908118b77",
+ };
+ static const char* const kDigestsWidth8[] = {
+ "d90d3abd368e58c513070a88b34649ba",
+ "77f7d53d0edeffb3537afffd9ff33a4a",
+ };
+ static const char* const kDigestsWidth16[] = {
+ "a50e268e93b48ae39cc2a47d377410e2",
+ "65c8502ff6d78065d466f9911ed6bb3e",
+ "bc2c873b9f5d74b396e1df705e87f699",
+ };
+ static const char* const kDigestsWidth32[] = {
+ "ca40d46d89773e7f858b15fcecd43cc0",
+ "bfdc894707323f4dc43d1326309f8368",
+ "f4733417621719b7feba3166ec0da5b9",
+ };
+ static const char* const kDigestsWidth64[] = {
+ "db38fe2e082bd4a09acb3bb1d52ee11e",
+ "3ad44401cc731215c46c9b7d96f7e4ae",
+ "6c43267be5ed03d204a05fe36090f870",
+ };
+ static const char* const kDigestsWidth128[] = {
+ "c8cfe46ebf166c1cbf08e8804206aadb",
+ "b0557b5156d2334c8ce4a7ee12f9d6b4",
+ };
+ // height < width implies 0.
+ // height == width implies 1.
+ // height > width implies 2.
+ const int height_index = block_size.height / block_size.width;
+ switch (block_size.width) {
+ case 4:
+ return kDigestsWidth4[height_index - 1];
+ case 8:
+ return kDigestsWidth8[height_index - 1];
+ case 16:
+ return kDigestsWidth16[height_index];
+ case 32:
+ return kDigestsWidth32[height_index];
+ case 64:
+ return kDigestsWidth64[height_index];
+ default:
+ EXPECT_EQ(block_size.width, 128)
+ << "Unknown width parameter: " << block_size.width;
+ return kDigestsWidth128[height_index];
+ }
+}
+
+TEST_P(AverageBlendTest8bpp, Blending) {
+ Test(GetAverageBlendDigest8bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest8bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest8bpp(GetParam()),
+ kNumSpeedTests / (GetParam().height * GetParam().width), false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>;
+
+const char* GetAverageBlendDigest10bpp(const TestParam block_size) {
+ static const char* const kDigestsWidth4[] = {
+ "98c0671c092b4288adcaaa17362cc4a3",
+ "7083f3def8bfb63ab3a985ef5616a923",
+ };
+ static const char* const kDigestsWidth8[] = {
+ "3bee144b9ea6f4288b860c24f88a22f3",
+ "27113bd17bf95034f100e9046c7b59d2",
+ };
+ static const char* const kDigestsWidth16[] = {
+ "24c9e079b9a8647a6ee03f5441f2cdd9",
+ "dd05777751ccdb4356856c90e1176e53",
+ "27b1d69d035b1525c013b7373cfe3875",
+ };
+ static const char* const kDigestsWidth32[] = {
+ "efd24dd7b555786bff1a482e51170ea3",
+ "3b37ddac87de443cd18784f02c2d1dd5",
+ "80d8070939a743a20689a65bf5dc0a68",
+ };
+ static const char* const kDigestsWidth64[] = {
+ "af1fe8c52487c9f2951c3ea516828abb",
+ "ea6f18ff56b053748c18032b7e048e83",
+ "af0cb87fe27d24c2e0afd2c90a8533a6",
+ };
+ static const char* const kDigestsWidth128[] = {
+ "16a83b19911d6dc7278a694b8baa9901",
+ "bd22e77ce6fa727267ff63eeb4dcb19c",
+ };
+ // (height < width) -> 0
+ // (height == width) -> 1
+ // (height > width) -> 2
+ const int height_index = block_size.height / block_size.width;
+ switch (block_size.width) {
+ case 4:
+ return kDigestsWidth4[height_index - 1];
+ case 8:
+ return kDigestsWidth8[height_index - 1];
+ case 16:
+ return kDigestsWidth16[height_index];
+ case 32:
+ return kDigestsWidth32[height_index];
+ case 64:
+ return kDigestsWidth64[height_index];
+ default:
+ EXPECT_EQ(block_size.width, 128)
+ << "Unknown width parameter: " << block_size.width;
+ return kDigestsWidth128[height_index];
+ }
+}
+
+TEST_P(AverageBlendTest10bpp, Blending) {
+ Test(GetAverageBlendDigest10bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest10bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest10bpp(GetParam()),
+ kNumSpeedTests / (GetParam().height * GetParam().width) / 2, false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
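
In the C configuration the new AverageBlend test has no SIMD baseline, so it compares against the distance-weighted blend called with equal weights of 8. Assuming 4-bit weight precision with rounding (an assumption about the blend's internals, not something this patch states), equal weights reduce exactly to the rounded average; a small self-checking sketch of that arithmetic:

#include <cassert>
#include <cstdint>

// Illustrative arithmetic only (not the libgav1 kernels, which also handle
// compound offsets): with assumed 4-bit weight precision and rounding, equal
// weights of 8 reduce to the rounded average of the two predictions.
int DistanceWeighted(const int p0, const int p1, const int w0, const int w1) {
  return (p0 * w0 + p1 * w1 + 8) >> 4;  // assumed precision/rounding
}

int RoundedAverage(const int p0, const int p1) { return (p0 + p1 + 1) >> 1; }

int main() {
  for (int p0 = 0; p0 < 256; ++p0) {
    for (int p1 = 0; p1 < 256; ++p1) {
      assert(DistanceWeighted(p0, p1, 8, 8) == RoundedAverage(p0, p1));
    }
  }
  return 0;
}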
diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h
index 2d70d2c..b820b77 100644
--- a/src/dsp/cdef.h
+++ b/src/dsp/cdef.h
@@ -30,6 +30,7 @@
// The order of includes is important as each tests for a superior version
// before setting the base.
// clang-format off
+#include "src/dsp/x86/cdef_avx2.h"
#include "src/dsp/x86/cdef_sse4.h"
// clang-format on
// IWYU pragma: end_exports
diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc
new file mode 100644
index 0000000..fd64593
--- /dev/null
+++ b/src/dsp/cdef_test.cc
@@ -0,0 +1,409 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kCdefDirectionName[] = "Cdef Direction";
+constexpr char kCdefFilterName[] = "Cdef Filtering";
+constexpr int kTestBufferStride = 8;
+constexpr int kTestBufferSize = 64;
+constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8;
+constexpr int kSourceBufferSize =
+ (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride;
+constexpr int kNumSpeedTests = 5000;
+
+const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
+ static const char* const kDigest[2][2] = {
+ {"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"},
+ {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}};
+ const int bitdepth_index = (bitdepth == 8) ? 0 : 1;
+ const int run_index = (num_runs == 1) ? 0 : 1;
+ return kDigest[bitdepth_index][run_index];
+}
+
+template <int bitdepth, typename Pixel>
+class CdefDirectionTest : public testing::TestWithParam<int> {
+ public:
+ CdefDirectionTest() = default;
+ CdefDirectionTest(const CdefDirectionTest&) = delete;
+ CdefDirectionTest& operator=(const CdefDirectionTest&) = delete;
+ ~CdefDirectionTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ CdefInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cdef_direction_ = nullptr;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ CdefInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) != 0) {
+ CdefInit_AVX2();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ CdefInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_cdef_direction_ = dsp->cdef_direction;
+ }
+
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[kTestBufferSize];
+ int strength_;
+ int size_;
+
+ CdefDirectionFunc base_cdef_direction_;
+ CdefDirectionFunc cur_cdef_direction_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (cur_cdef_direction_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration elapsed_time;
+ libvpx_test::MD5 actual_digest;
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) {
+ for (int bits = 0; bits <= bitdepth; ++bits) {
+ for (auto& pixel : buffer_) {
+ pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << bitdepth) - 1);
+ }
+ int output[2] = {};
+ const absl::Time start = absl::Now();
+ cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel),
+ reinterpret_cast<uint8_t*>(&output[0]), &output[1]);
+ elapsed_time += absl::Now() - start;
+ actual_digest.Add(reinterpret_cast<const uint8_t*>(output),
+ sizeof(output));
+ }
+ }
+ }
+ test_utils::CheckMd5Digest(kCdef, kCdefDirectionName,
+ GetDirectionDigest(bitdepth, num_runs),
+ actual_digest.Get(), elapsed_time);
+}
+
+using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>;
+
+TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>;
+
+TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33",
+ "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54",
+ "1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4",
+ "9deaaf07db25ca1d96ea8762925372d3", "7edadd9ad058be518430e64f78fe34a2",
+ "862362a654edb2562609895395eb69cd", "3b4dae4d353b75f652ce67f96b2fd718",
+ "65c51f49e4fd848d9fef23a346702b17", "f93b3fa86764e53e4c206ef01d5ee9db",
+ "202e36551bc147c30b76ae359d5f7646", "3de677a2b6fe4aa6fc29a5e5f2d63063",
+ "ab860362809e878f7b47dacc6087bce3", "c0d991affc8aeb45d91ae36e7b3d77d8",
+ "27f19fffabfb79104b4be3c272723f62", "a54b981f562e2cf10a4fb037d0181e2d",
+ "9a65933d02867a1e8fc1f29097d4d0db", "c068b21d232145c61db8ef9298447bfa",
+ "8db1948c23648372509e4f3577e8eaa0", "c08a3b192ab0a47abe22f7f0ae78a5d7",
+ "4ff9bd4ae06f2cc2d2660df41cf1baca", "a0a634e48c55a2ca340cf5cac7f74cb6",
+ "f9f631985b42214f8b059c8f119d4401", "5fb136073300a45d74145649473970da",
+ "33624aab8ba0264657fa9304dbdcf72c", "e6a15775d451a3c4803a7c0604deb0ea",
+ "4c28b63022cdc5ea0e49b492c187d53d", "c5fa9792ee292d29c5a864e376ddacc0",
+ "fcdf7319978b64f03ca3b9d4d83a0c2a", "394931c89bd5065308b0633d12370b19",
+ "9e702d68000c1b02759001e9a8876df2", "c844919f0114e83960dd329b1aa7146f",
+ "499248c675884db3ef57018d0a0868b5", "4a9041ed183f9add717e5ddcdb280799",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70",
+ "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8",
+ "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31",
+ "18910f422e386c71ffde8680176d61c0", "3255afe8b3db5be4c17299420ae9b4b3",
+ "ccac34de92891d4ef25820737e7a4f06", "5c2109c4142867c15bc6bb81e19b8058",
+ "86e8300e2ad292bfce95185530ef06c8", "21c06ed6d62b8fbef1363cd177386cd0",
+ "fd6687987dbff6f15210c2cc61570daa", "7cb246cb65a9cf9b2f829ab086f7c45a",
+ "3a38dc3c89f7e400383b1b7ce3e73008", "7b23b520e41ad510b9608b47f9c5f87e",
+ "f9ca24b57fc06d7b8dc4151bbc4d2840", "070ef8fa64dcdc45701428ee6ef0ca79",
+ "0e7e3ca3cf8546972d01fc262b2b9cfb", "9ac81b7cf93173f33d195927b0a3685a",
+ "1f964b6959774651a79d961e5a2a6a56", "64d5f88995a918a85df317d4240f0862",
+ "55c94ec09facda30fac677d205beb708", "2c010b256f4dabf42ef78bf5a3851b2c",
+ "c7d18d0e287fa8658b94131603e378db", "4f7696fe2c8dbedd0c8e8a53b9dec0fc",
+ "b3483dc32665a4bb0606d78dfb3d285c", "0bcb4acd4090f5798c2d260df73b2c46",
+ "4f574c782f3b28fb9c85cdb70dfcb46a", "14bd700a88be0107e9ef2fe54f75cee6",
+ "5d3b2698c9ffa4a6aed45a9adbddb8bf", "eff870414f80897cf8958ebeea84f0a6",
+ "e042843275f82271a9f540bc3e4ef35c", "26e3ff3d661dac25861a0f5bab522340",
+ "239844e66b07796003f9315166b9e29e", "44b8e6884215a1793cc7f8f7ce40bcee",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct CdefTestParam {
+ CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4,
+ int columns4x4)
+ : subsampling_x(subsampling_x),
+ subsampling_y(subsampling_y),
+ rows4x4(rows4x4),
+ columns4x4(columns4x4) {}
+ int subsampling_x;
+ int subsampling_y;
+ int rows4x4;
+ int columns4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) {
+ return os << "subsampling(x/y): " << param.subsampling_x << "/"
+ << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4
+ << ", " << param.columns4x4;
+}
+
+// TODO(b/154245961): rework the parameters for this test to match
+// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and
+// primary/secondary strength combinations for both Y and UV.
+template <int bitdepth, typename Pixel>
+class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
+ public:
+ CdefFilteringTest() = default;
+ CdefFilteringTest(const CdefFilteringTest&) = delete;
+ CdefFilteringTest& operator=(const CdefFilteringTest&) = delete;
+ ~CdefFilteringTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ CdefInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ CdefInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ CdefInit_SSE4_1();
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) != 0) {
+ CdefInit_AVX2();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_));
+ }
+
+ void TestRandomValues(int num_runs);
+
+ uint16_t source_[kSourceBufferSize];
+ Pixel dest_[kMaxPlanes][kTestBufferSize];
+ int primary_strength_;
+ int secondary_strength_;
+ int damping_;
+ int direction_;
+ CdefTestParam param_ = GetParam();
+
+ CdefFilteringFuncs cur_cdef_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ const int id = ((param_.rows4x4 < 4) + (param_.rows4x4 < 2)) * 3 +
+ param_.subsampling_x * 9 + param_.subsampling_y * 18;
+ absl::Duration elapsed_time;
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x;
+ const int subsampling_y = (plane == kPlaneY) ? 0 : param_.subsampling_y;
+ const int block_width = 8 >> subsampling_x;
+ const int block_height = 8 >> subsampling_y;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ id + plane);
+ const int offset = 2 * kSourceStride + 2;
+ // Fill boundaries with a large value such that cdef does not take them
+ // into account in the calculation.
+ const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x;
+ const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y;
+ for (int y = 0; y < plane_height; ++y) {
+ for (int x = 0; x < plane_width; ++x) {
+ source_[y * kSourceStride + x + offset] =
+ rnd.Rand16() & ((1 << bitdepth) - 1);
+ }
+ }
+ for (int y = 0; y < 2; ++y) {
+ Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride);
+ Memset(&source_[(y + plane_height + 2) * kSourceStride],
+ kCdefLargeValue, kSourceStride);
+ }
+ for (int y = 0; y < plane_height; ++y) {
+ Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2);
+ Memset(&source_[y * kSourceStride + offset + plane_width],
+ kCdefLargeValue, 2);
+ }
+ do {
+ int strength = rnd.Rand16() & 15;
+ if (strength == 3) ++strength;
+ primary_strength_ = strength << (bitdepth - 8);
+ } while (primary_strength_ == 0);
+ do {
+ int strength = rnd.Rand16() & 3;
+ if (strength == 3) ++strength;
+ secondary_strength_ = strength << (bitdepth - 8);
+ } while (secondary_strength_ == 0);
+ damping_ = (rnd.Rand16() & 3) + 3;
+ direction_ = (rnd.Rand16() & 7);
+
+ memset(dest_[plane], 0, sizeof(dest_[plane]));
+ const absl::Time start = absl::Now();
+ const int width_index = block_width >> 3;
+ if (cur_cdef_filter_[width_index][0] == nullptr) return;
+ cur_cdef_filter_[width_index][0](
+ source_ + offset, kSourceStride, block_height, primary_strength_,
+ secondary_strength_, damping_, direction_, dest_[plane],
+ kTestBufferStride * sizeof(dest_[0][0]));
+ elapsed_time += absl::Now() - start;
+ }
+ }
+
+ for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+ if (bitdepth == 8) {
+ test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
+ GetDigest8bpp(id + plane),
+ reinterpret_cast<uint8_t*>(dest_[plane]),
+ sizeof(dest_[plane]), elapsed_time);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ } else {
+ test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
+ GetDigest10bpp(id + plane),
+ reinterpret_cast<uint8_t*>(dest_[plane]),
+ sizeof(dest_[plane]), elapsed_time);
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+ }
+}
+
+// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not
+// supported.
+const CdefTestParam cdef_test_param[] = {
+ CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2),
+ CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2),
+ CdefTestParam(0, 1, 4, 4), CdefTestParam(0, 1, 2, 2),
+ CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2),
+};
+
+using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>;
+
+TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>;
+
+TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
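
The new CDEF test surrounds each plane with a two-sample ring of kCdefLargeValue so the filter treats those positions as unavailable, as the comment in TestRandomValues() notes. A reduced sketch of that padding layout (illustrative only; the real test writes into a 16-bit buffer with kSourceStride and uses Memset):

#include <cstdint>

// Illustrative only: pad a 2-sample ring around a width x height plane with a
// sentinel value, mirroring the fill pattern in CdefFilteringTest above.
void PadWithSentinel(uint16_t* const buffer, const int stride, const int width,
                     const int height, const uint16_t sentinel) {
  const int offset = 2 * stride + 2;  // plane origin inside the padded buffer
  // Two full rows above and below the plane.
  for (int y = 0; y < 2; ++y) {
    for (int x = 0; x < stride; ++x) {
      buffer[y * stride + x] = sentinel;
      buffer[(y + height + 2) * stride + x] = sentinel;
    }
  }
  // Two columns to the left and right of every plane row.
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < 2; ++x) {
      buffer[y * stride + offset - 2 + x] = sentinel;
      buffer[y * stride + offset + width + x] = sentinel;
    }
  }
}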
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
index 0099ca3..1b85795 100644
--- a/src/dsp/constants.cc
+++ b/src/dsp/constants.cc
@@ -20,7 +20,7 @@ namespace libgav1 {
// Each set of 7 taps is padded with a 0 to easily align and pack into the high
// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
-const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
{{-6, 10, 0, 0, 0, 12, 0, 0},
{-5, 2, 10, 0, 0, 9, 0, 0},
{-3, 1, 1, 10, 0, 7, 0, 0},
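
With the alignas(16) added above, consumers can pull two adjacent 8-byte tap rows with a single aligned 16-byte load, which is the packing the comment describes. A sketch of an assumed x86 consumer (illustrative only; the example values are the first two rows of the table):

#include <emmintrin.h>  // SSE2

#include <cstdint>

// Two rows copied from kFilterIntraTaps; alignas(16) mirrors the change above.
alignas(16) constexpr int8_t kExampleTaps[16] = {-6, 10, 0,  0, 0, 12, 0, 0,
                                                 -5, 2,  10, 0, 0, 9,  0, 0};

// With 16-byte alignment the aligned load form can be used directly (an
// assumption about the consumer; otherwise an unaligned load would be needed).
inline __m128i LoadTwoTapRows(const int8_t* const taps) {
  return _mm_load_si128(reinterpret_cast<const __m128i*>(taps));
}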
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
index 8c6f68f..727b4af 100644
--- a/src/dsp/convolve.cc
+++ b/src/dsp/convolve.cc
@@ -623,6 +623,8 @@ void ConvolveIntraBlockCopy2D_C(const void* const reference,
const int /*vertical_filter_id*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
const auto* src = static_cast<const Pixel*>(reference);
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<Pixel*>(prediction);
@@ -676,6 +678,8 @@ void ConvolveIntraBlockCopy1D_C(const void* const reference,
const int /*vertical_filter_id*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
const auto* src = static_cast<const Pixel*>(reference);
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<Pixel*>(prediction);
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
new file mode 100644
index 0000000..4a2a9f1
--- /dev/null
+++ b/src/dsp/convolve_test.cc
@@ -0,0 +1,1373 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The convolve function will access at most (block_height + 7) rows/columns
+// from the beginning.
+constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+
+// Test all the filters in |kSubPixelFilters|. There are 6 different filters but
+// filters [4] and [5] are only reached through GetFilterIndex().
+constexpr int kMinimumViableRuns = 4 * 16;
+
+// When is_scaled_convolve_ is true, we don't test every combination of
+// type_param_, so some digests in ths array are redudant, marked as
+// "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".
+// We keep it so that the logic of calculation id in GetDigestId() is clearer.
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "ae5977a4ceffbac0cde72a04a43a9d57", "fab093b917d36f6b69fb4f50a6b5c822",
+ "1168251e6261e2ff1fa69a93226dbd76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d2f5ca2b7958c332a3fb771f66da01f0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6bbcc075f8b768a02cdc9149f150326d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c4e90cd202f9867517433b550afdc644", "43d6df191744f6c5d489c0673714a714",
+ "bfe8197057b0f3f096344251047f481f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1681719b0f8905d99382f4132fe1472a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8d24b59c0f3942079ba4945ed6686269", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ae5977a4ceffbac0cde72a04a43a9d57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "995318eff1fe62822366490192ad8b5e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0ef1c5beb3228c6d9ecf3ced584c4aa8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "fc02228efb85c665bd27a3dab72a9037", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6cf5f791fe0d8dcd3526be3c6b814035", "eaa0942097fd2b2dd621b77e0a659896",
+ "4821befdf63f8c6da6440afeb57f320f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7aec92c3b65e456b64ae285c12b03b0d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4ae70d9db2ec36885394db7d59bdd4f7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "911212ae2492690de06d12bfaf71c7d4", "cb284b0ae039582039563638f682db26",
+ "6b4393b2d7387dd291d3a7bd3aabcae4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0804d93136549388b6cd7fdcd187a578", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b25f037602efdb4eaacb3ade1dc5c28f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6cf5f791fe0d8dcd3526be3c6b814035", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "704b0bb4128aa163ef5899e6d8ad9664", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "abf3f31ec4daff000e80f7ab9628688b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "09e12a389cd454e10f750062102ea1b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d905dfcad930aded7718587c05b48aaf", "fe85aaee8007d2130d56919242e01163",
+ "c30fc44d83821141e84cc4793e127301", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f72a99ad63f6a88c23724e898b705d21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5fee162fe52c11c823db4d5ede370654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a9210113ff6873e5b50d5d3ad67e440f", "b7633a78f959b20ca27ffb700b44b45c",
+ "6d1c5145be9fd636ababd64c64d23a10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d55d8012ddddb55e6c3e51dafab92980", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b1948cb353fa308f0d5592b0ad338997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d905dfcad930aded7718587c05b48aaf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "04e3b7f46e748431c76cf6125057601c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "71362b65cffd008d1ca4a20adc8cc15f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "987f7a6a8bef47acbd1e49bb39f51ac4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6baf153feff04cc5b7e87c0bb60a905d", "fa1ad095bf696745599079fb73975b75",
+ "a8293b933d9f2e5d7f922ea40111d643", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "07a1f07f114c4a38ba08d2f44e1e1132", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9365186c59ef66d9def40f437022ad93", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a7305087fae23de53d21a6909009ff69", "bd44440b5757b74bcc3e2f7f32ef42af",
+ "a5a1ac658d7ce4a846a32b9fcfaa3475", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3b1ceebf0579fcbbfd6136938c595b91", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3bfad931bce82335219e0e29c15f2b21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6baf153feff04cc5b7e87c0bb60a905d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4cfad2c437084a93ea76913e21c2dd89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1a0bdfc96a3b9fd904e658f238ab1076", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b8a710baa6a9fc784909671d450ecd99", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "871ed5a69ca31e6444faa720895949bf", "e55d0c54fd28355d32e29d411488b571",
+ "354a54861a94e8b027afd9931e61f997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "26b9de95edb45b31ac5aa19825831c7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0f95fb0276c9c7910937fbdf75f2811d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8dcce009395264379c1a51239f4bb22c", "06925f05ea49811e3efc2a44b111b32b",
+ "2370f4e4a83edf91b7f504bbe4b00e90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ecafabcad1045f15d31ce2f3b13132f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "68a701313d2247d2b32636ebc1f2a008", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "871ed5a69ca31e6444faa720895949bf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d372f0c17bce98855d6d59fbee814c3d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "56d16e54afe205e97527902770e71c71", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f9e6a56382d8d12da676d6631bb6ef75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "68e2f90eaa0ab5da7e6f5776993f7eea", "8718965c4831a363a321a25f4aada7ba",
+ "eeeb8589c1b31cbb565154736ca939ec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c1b836a6ce023663b90db0e320389414", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b355dab2dbb6f5869018563eece22862", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8dcce009395264379c1a51239f4bb22c", "e7c2bfd356c860c36053dea19f8d0705",
+ "ae5464066a049622a7a264cdf9394b55", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5f211eba020e256a5781b203c5aa1d2e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "05afe1f40d37a45a97a5e0aadd5066fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "68e2f90eaa0ab5da7e6f5776993f7eea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d99ffd2579eb781c30bc0df7b76ad61e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3bf8e11e18527b16f0d7c0361d74a52d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f1f8282fb33c30eb68c0c315b7a4bc01", "4c718ddbe8b5aa7118c8bc1c2f5ea158",
+ "f49dab626ddd977ed171f79295c24935", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5befcf222152ebc8d779fcc10b95320a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cf6ff8c43d8059cea6090a23ab66a0ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d90a69e7bae8aa46ed0e1e5f911d7a07", "1d7113d705fa0edeef49e5c50a91151d",
+ "45368b6db3d1fee739a64b0bc823ea9c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3b04497634364dd2cd3f2482b5d4b32f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9e1f0e0bddb58d15d0925eeaede9b84c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f1f8282fb33c30eb68c0c315b7a4bc01", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4e139e57cbb049a0f4ef816adc48d026", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "79e9e260a2028c5fe320005c272064b9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9412064b0eebf8123f23d74147d04dff", "0dee657827cd48c4ce4a7657f6f92233",
+ "78d2f27e0d4708cb16856d7d40dc16fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "62adf407fc27d8682ced4dd7b55af14e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a336f8b7bcf188840ca65c0d0e66518a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6ab4dc87be03be1dcc5d956ca819d938", "78cef82670ff99b1e4a279de3538c233",
+ "8dff0f28192d9f8c0bf7fb5405719dd8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a8ac7b5dc65ffb758b0643508a0e744e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "03313cdaa593a1a7b4869010dcc7b241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9412064b0eebf8123f23d74147d04dff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "be53b2507048e7ff50226d15c0b28865", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2418ebcdf85551b9ae6e3725f04aae6d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "06ef1504f31af5f173d3317866ca57cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cc08936effe309ab9a4fa1bf7e28e24e", "a81bcdeb021d3a23477c40c47548df52",
+ "9d2393ea156a1c2083f5b4207793064b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "35be0786a072bf2f1286989261bf6580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "de953f03895923359c6a719e6a537b89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6ab4dc87be03be1dcc5d956ca819d938", "e053321d7c75951d5ff3dce85762acd3",
+ "632738ef3ff3021cff45045c41978849", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "561ed8be43c221a561f8885a0d74c7ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "88a50d2b4107ee5b5074b2520183f8ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cc08936effe309ab9a4fa1bf7e28e24e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b73f3c1a10405de89d1f9e812ff73b5a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "98bdf907ebacacb734c9eef1ee727c6e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "635e8ee11cf04d73598549234ad732a0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "36cbef36fa21b98df03536c918bf752a", "b7a4d080e2f24040eebb785f437de66a",
+ "a9c62745b95c66fa497a524886af57e2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "90562fc42dc5d879ae74c4909c1dec30", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8463ade9347ed602663e2cec5c4c3fe6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8f2afdb2f03cd04ffacd421b958caaa0", "2e15a26905467e5ad9f8da04b94e60b6",
+ "f7ec43384037e8d6c618e0df826ec029", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8159619fc234598c8c75154d80021fd4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ac50ea9f7306da95a5092709442989cf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "36cbef36fa21b98df03536c918bf752a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c7d51b1f2df49ab83962257e8a5934e5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4dd5672d53c8f359e8f80badaa843dfc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "fab693410d59ee88aa2895527efc31ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9d0da6321cf5311ea0bdd41271763030", "22ff7819c55ce6b2e0ce5431eb8c309c",
+ "2c614ec4463386ec075a0f1dbb587933", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a1427352f9e413975a0949e2b300c657", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "392de11ffcd5c2ecf3db3480ee135340", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "710ccecc103033088d898a2b924551fb", "160c29a91e372d66b12e171e4d81bc18",
+ "a6bc648197781a2dc99c487e66464320", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8f43645dce92cf7594aa4822aa53b17d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "739b17591437edffd36799237b962658", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9d0da6321cf5311ea0bdd41271763030", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "159e443d79cc59b11ca4a80aa7aa09be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a1bef519bbf07138e2eec5a91694de46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3041eb26c23a63a587fbec623919e2d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "55a10165ee8a660d7dddacf7de558cdd", "355b691a656e6a287e4504ef2fbb8034",
+ "7a8856480d752153370240b066b90f6a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bcbc418bc2beb243e463851cd95335a9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bddd31e3e852712e6244b616622af83d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "710ccecc103033088d898a2b924551fb", "f6cb80c4d5683553929b1e41f99e487e",
+ "1112ebd509007154c72c5a485b220b62", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b6ccddb7dfa4eddc87b4eff08b5a3195", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b8a7eb7dd9c216e240517edfc6489397", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "55a10165ee8a660d7dddacf7de558cdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6ef14b14882e1465b0482b0e0b16d8ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "df1cb51fe1a937cd7834e973dc5cb814", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c61d99d5daf575664fb7ad64976f4b03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ac7fc9f9ea7213743fae5a023faaaf08", "a6307a981600c3fb5b9d3e89ddf55069",
+ "beaef1dbffadc701fccb7c18a03e3a41", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cb8fedcbecee3947358dc61f95e56530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "30a36245c40d978fc8976b442a8600c3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a4093e3e5902dd659407ce6471635a4e", "658f0f51eb2f965f7490053852653fc0",
+ "9714c4ce636b6fb0ad05cba246d48c76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b4e605327b28db573d88844a1a09db8d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "75b755f199dbf4a0e5ebbb86c2bd871d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ac7fc9f9ea7213743fae5a023faaaf08", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "22a8d287b425c870f40c64a50f91ce54", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "317fe65abf81ef3ea07976ef8667baeb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "822f6c4eb5db760468d822b21f48d94d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "077e1b7b355c7ab3ca40230ee8efd8ea", "628229ce2484d67e72c51b2f4ad124a6",
+ "72b1e700c949d06eaf62d664dafdb5b6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0d0154a7d573685285a83a4cf201ac57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "93aa662b988b8502e5ea95659eafde59", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "375d7f5358d7a088a498b8b3aaecc0d5", "b726ef75b641c21519ecc2f802bbaf39",
+ "2c93dde8884f09fb5bb5ad6d95cde86d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "15b00a15d1cc6cc96ca85d00b167e4dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "31b0017ba1110e3d70b020901bc15564", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "077e1b7b355c7ab3ca40230ee8efd8ea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f1d96db5a2e0a2160df38bd96d28d19b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2da29da97806ae0ee300c5e69c35a4aa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3f6fcb9fae3666e085b9e29002a802fc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7a3e8de2a1caae206cf3e51a86dfd15a", "c266a1b65599686c771fad8a822e7a49",
+ "684f5c3a25a080edaf79add6e9137a8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b14bd8068f108905682b83cc15778065", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "70440ba9ee7f9d16d297dbb49e54a56e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "375d7f5358d7a088a498b8b3aaecc0d5", "4dca696cc0552c1d684c4fc963adc336",
+ "a49e6160b5d1b56bc2046963101cd606", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7bf911888c11a9fefd604b8b9c82e9a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0a1aa8f5ecfd11ddba080af0051c576a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7a3e8de2a1caae206cf3e51a86dfd15a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "637d1e5221422dfe9a6dbcfd7f62ebdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "555475f5d1685638169ab904447e4f13", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d9b9fecd195736a6049c528d4cb886b5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1ddf9020f18fa7883355cf8c0881186a", "e681b35b1fe02e2a6698525040015cd0",
+ "3be970f49e4288988818b087201d54da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c96c867d998473197dde9b587be14e3a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1eb2be4c05b50e427e29c72fa566bff5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "08867ea5cc38c705ec52af821bc4736a", "c51c8bb294f4fa20bdab355ad1e7df37",
+ "7f084953976111e9f65b57876e7552b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bfb69b4d7d4aed73cfa75a0f55b66440", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "536181ee90de883cc383787aec089221", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1ddf9020f18fa7883355cf8c0881186a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f275af4f1f350ffaaf650310cb5dddec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b3e3a6234e8045e6182cf90a09f767b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "fed17fc391e6c3db4aa14ea1d6596c87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2377dd167ef2707978bed6f10ffd4e76", "b1f6c0cd490b584b1883222a4c281e0f",
+ "d2b9dba2968894a414756bb510ac389a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f596c63c7b14cada0174e17124c83942", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "52c0980bae63e8459e82eee7d8af2334", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2afb540e8063f58d1b03896486c5e89b", "b929f7956cf35dd6225ca6cf45eacb23",
+ "0846ec82555b66197c5c45b08240fbcc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "034d1d62581bd0d840c4cf1e28227931", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "29f82b0f3e4113944bd28aacd9b8489a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2377dd167ef2707978bed6f10ffd4e76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f81c4d6b001a14584528880fa6988a87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "849dfeca59074525dea59681a7f88ab4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d0d3482d981989e117cbb32fc4550267", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f918e0e4422967c6a7e47298135c7ae9", "fc8718e6f9e6663c2b6bf9710f835bfc",
+ "9a3215eb97aedbbddd76c7440837d040", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "eb2822ad8204ed4ecbf0f30fcb210498", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "75e57104d6058cd2bce1d3d8142d273d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2afb540e8063f58d1b03896486c5e89b", "d9d9f3c699cd03ab9d698e6b235ddcc6",
+ "ca7471c126ccd22189e874f0a6e41960", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8cba849640e9e2859d509bc81ca94acd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ee3e76371240d1f1ff811cea6a7d4f63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f918e0e4422967c6a7e47298135c7ae9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a5a2f9c2e7759d8a3dec1bc4b56be587", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "39a68af80be11e1682b6f3c4ede33530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "39561688bf6680054edbfae6035316ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b2264e129636368b5496760b39e64b7a", "4dbb4ce94d4948c990a51b15959d2fa6",
+ "4e317feac6da46addf0e8b9d8d54304b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "538ce869ffd23b6963e61badfab7712b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b4c735269ade44419169adbd852d5ddc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6ce47b11d2e60c5d183c84ce9f2e46cc", "3ac8d5b68ebb29fd1a41c5fa9d5f4382",
+ "0802b6318fbd0969a33de8fdfcd07f10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bc79acf2a0fe419194cdb4529bc7dcc8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "17a20dbbf09feae557d40aa5818fbe76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b2264e129636368b5496760b39e64b7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2317c57ab69a36eb3bf278cf8a8795a3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b22d765af176d87e7d3048b4b89b86ad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "087c5992ca6f829e1ba4ba5332d67947", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9cf1deba08dac5972b3b0a43eff8f98", "84777bdeb84e2530a1c8c1ee432ec934",
+ "b384e9e3d81f9f4f9024028fbe451d8b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4e4677a0623d44237eb8d6a622cdc526", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "356d4003477283e157c8d2b5a79d913c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9cf1deba08dac5972b3b0a43eff8f98", "1e58b76ca365b0bd4fd3c4519ec4a500",
+ "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9cf1deba08dac5972b3b0a43eff8f98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1e58b76ca365b0bd4fd3c4519ec4a500", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "b1b6903d60501c7bc11e5285beb26a52", "3fa4ebd556ea33cfa7f0129ddfda0c5b",
+ "a693b4bd0334a3b98d45e67d3985bb63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3e787534dff83c22b3033750e448865a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "fd1da8d197cb385f7917cd296d67afb9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d9941769b66d012c68f70accc1a3b664", "98728677401560d7c29ba8bec59c6a00",
+ "2924788891caa175bb0725b57de6cbd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "915a60e7bb2c38ad5a556098230d6092", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a25de86fd8d389c1c75405aac8049b58", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b1b6903d60501c7bc11e5285beb26a52", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cf792b94b1f3f321fa0c1d6362d89c90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5f1622fde194bd04560b04f13dc47a7c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d935e0ec1d933d0c48fa529be4f998eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a7855ed75772d7fa815978a202bbcd9f", "cd3e8b96ff6796650e138f5d106d70d4",
+ "156de3172d9acf3c7f251cd7a18ad461", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4c91f676a054d582bcae1ca9adb87a31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a984202c527b757337c605443f376915", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "20a390cc7e06a265ecc1e118f776c25a", "ab0da36b88021ed0efd806a1a4cd4fa0",
+ "fc57a318fbf0c0f29c24edbc84e35ec6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "568055866caf274d67e984307cda2742", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3ff2b19730d6bb8b97f4d72085d2d5b8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a7855ed75772d7fa815978a202bbcd9f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "acc8588292b326f15076dd3a3d260072", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f990a13f7a062665d7f18a40bd5da2ae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "931df73c3d50c4b2e4ec3502bc8774de", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bde291a4e8087c085fe8b3632f4d7351", "555eead3b67766f56b0e3714d431506f",
+ "e545b8a3ff958f8363c7968cbae96732", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "eab5894046a99ad0a1a12c91b0f37bd7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c347f4a58fd784c5e88c1a23e4ff15d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9272ee0820b09bfdc252a97b2e103862", "be8dd418158226a00d5e01ccc3e4f66b",
+ "34b37b59ee49108276be28a2e4585c2d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f4deb462014249d4ab02db7f7f62308e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6ae557169928f3be15c7aad8d67205b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bde291a4e8087c085fe8b3632f4d7351", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "14be0f12550c814f75655b4e1e22ddde", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "af4cadb78ee54aacebac76c8ad275375", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c0c4ebfd6dbbddd88114c36e8c9085da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "238980eebc9e63ae3eea2771c7a70f12", "661c69a7b49984fa1e92cf8485ab28b6",
+ "7842b2047356c1417d9d88219707f1a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "765b4cfbfc1a4988878c412d53bcb597", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "29cbaadbff9adf4a3d49bd9900a9dd0b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7e3fa9c03bc3dfbdeb67f24c5d9a49cd", "a65e13b534b32fdff3f48d09389daaf1",
+ "da1a6ff2be03ec8acde4cb1cd519a6f0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d54206c34785cc3d8a06c2ceac46378c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b1f26ee13df2e14a757416ba8a682278", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "238980eebc9e63ae3eea2771c7a70f12", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "e552466a4e7ff187251b8914b084d404", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aba5d5ef5e96fe418e65d20e506ea834", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "972aeba65e8a6d20dd0f95279be2aa75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0eac13431bd7d8a573318408a72246d5", "71c57b774e4c3d9b965b060e2a895448",
+ "1a487c658d684314d91bb6d961a94672", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bc63b29ec78c1efec5543885a45bb822", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c5997b802a6ba1cf5ba1057ddc5baa7e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f3454ca93cbb0c8c09b0695d90a0df3d", "d259b9c0d0e3322114b2bcce04ae35dd",
+ "a4ca37cb869a0dbd1c4a2dcc449a8f31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "85a11892ed884e3e74968435f6b16e64", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "996b6c166f9ed25bd07ea6acdf7597ff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0eac13431bd7d8a573318408a72246d5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "981b7c44b6f7b7ac2acf0cc4096e6bf4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d70bf16e2a31e90b7b3cdeaef1494cf9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "34165457282e2af2e9b3f5840e4dec5d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "73438155feb62595e3e406921102d748", "86d00d2e3dd4a198343f37e3dc4461c9",
+ "0635a296be01b7e641de98ee27c33cd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cecd57396a0033456408f3f3554c6912", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "59f33727e5beeb783a057770bec7b4cd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f3454ca93cbb0c8c09b0695d90a0df3d", "b11f98b5bb864413952d47a67b4add79",
+ "1b5d1d4c7be8d5ec00a42a49eecf918f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "16434230d24b9522ae2680e8c37e1b95", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "34895d4c69a6c3303693e6f431bcd5d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "73438155feb62595e3e406921102d748", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a4c75372af36162831cb872e24e1088c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6df80bb7f264f4f285d09a4d61533fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b8c5582b9bbb789c45471f93be83b41f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5871e0e88a776840d619670fbf107858", "57dd2cde826c50e0b0ec504396cb3ceb",
+ "82dc120bf8c2043bc5eee81007309ebf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5b37f94ef136c1eb9a6181c19491459c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0654d72f22306b28d9ae42515845240c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1a77d2af4d2b6cf8737cfbcacacdc4e4", "7123d4aa8083da90ec6986dda0e126ce",
+ "98b77e88b0784baaea64c98c8707fe46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "963dea92f3efbb99137d1de9c56728d3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9497b00cb1bc3363dd126ffdddadc8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5871e0e88a776840d619670fbf107858", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "497271227a70a72f9ad25b415d41563f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c8831118d1004a7cca015a4fca140018", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "257bf5467db570974d7cf2356bacf116", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1c6376ce55c9ee9e35d432edb1ffb3b7", "6fff9189c1d11f183f7c42d4ce5febdb",
+ "58c826cad3c14cdf26a649265758c58b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "716ba3a25b454e44b46caa42622c128c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6c9d7d9e6ef81d76e775a85c53abe209", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "89bec831efea2f88129dedcad06bb3fa", "e1ef4ae726d864b36a9b64b1e43ede7e",
+ "8148788044522edc3c497e1017efe2ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b72fb6a9a073c2fe65013af1842dc9b0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1e461869bb2ee9b6069c5e52cf817291", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1c6376ce55c9ee9e35d432edb1ffb3b7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c48bd7e11ec44ba7b2bc8b6a04592439", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b7f82c140369067c105c7967c75b6f9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5255dded79f56b0078543b5a1814a668", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d675e0195c9feca956e637f3f1959f40", "670fa8c31c82fced9a810b64c03e87ee",
+ "f166254037c0dfb140f54cd7b08bddfe", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9076f58c4ab20f2f06d701a6b53b1c4f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a35f435ccc67717a49251a07e62ae204", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "89bec831efea2f88129dedcad06bb3fa", "7c3a79a90f3f4b460540e796f3197ef1",
+ "acf60abeda98bbea161139b915317423", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "86fa0c299737eb499cbcdce94abe2d33", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8d7f1d7ea6a0dcc922ad5d2e77bc74dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d675e0195c9feca956e637f3f1959f40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0960a9af91250e9faa1eaac32227bf6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "130f47aae365aabfec4360fa5b5ff554", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ef745100f5f34c8ff841b2b0b57eb33f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b5681673903ade13d69e295f82fdd009", "9ccd4cc6216eab35ddcb66a76b55dd2f",
+ "74ab206f14ac5f62653cd3dd71a7916d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d3212ab3922f147c3cf126c3b1aa17f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c5325015cb0b7c42839ac4aa21803fa0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "dead0fe4030085c22e92d16bb110de9d", "3c6d97f25d6bc647c843850be007f512",
+ "262c96b1f2c4f85c86c0e9c77fedff1e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6b80af04470b83673d98f46925e678a5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "138855d9bf0ccd0c62ac14c7bff4fd37", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b5681673903ade13d69e295f82fdd009", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "746c2e0f96ae2246d534d67102be068c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "92483ed631de21b685ffe6ccadbbec8f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "edae8ed67286ca6a31573a541b3deb6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3c43020105ae93a301404b4cd6238654", "cef7cfdcb8ca8d2612f31a1fe95ce371",
+ "5621caef7cc1d6522903290ccc5c2cb8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "b55fea77f0e14a8bf8b6562b766fe91f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f81f31f1585c0f70438c09e829416f20", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "306a2f5dfd675df4ed9af44fd5cac8c0", "1dfda318021a05a7e72fd815ddb0dfc8",
+ "f35a3d13516440f9168076d9b07c9e98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "65baca6167fe5249f7a839ce5b2fd591", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "64035142864914d05a48ef8e013631d0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3c43020105ae93a301404b4cd6238654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d6f6db079da9b8909a153c07cc9d0e63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cbb6ab31547df6b91cfb48630fdffb48", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "01adcd8bf15fbf70df47fbf3a953aa14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "dd2c5880a94ed3758bfea0b0e8c78286", "5f6c1725f4c7c73a8d8f0d9468106624",
+ "78ec6cf42cce4b1feb65e076c78ca241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "59b578268ff26a1e21c5b4273f73f852", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ab10b22fb8dd8199040745565b28595d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "306a2f5dfd675df4ed9af44fd5cac8c0", "9209f83153ef6f09b5262536a2dc1671",
+ "13782526fc2726100cb3cf375b3150ed", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "e47ded6c0eec1d5baadd02aff172f2b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "205904fa3c644433b46e01c11dd2fe40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "dd2c5880a94ed3758bfea0b0e8c78286", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7c8928a0d769f4264d195f39cb68a772", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ba539808a8501609ce052a1562a62b25", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4ebb1a7b25a39d8b9868ec8a1243103f", "c2732a08997e1f5176dfb297d2e89235",
+ "42188e2dbb4e02cd353552ea147ad03f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "16761e7c8ba2645718153bed83ae78f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0d928d6111f86c60ccefc6c6604d5659", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9d01c946a12f5ef9d9cebd9816e06014", "d738eb9f3f4f0b412b93687b55b6e45a",
+ "13c07441b47b0c1ed80f015ac302d220", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c0950e609f278efb7050d319a9756bb3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "291425aaf8206b20e88db8ebf3cf7e7f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4ebb1a7b25a39d8b9868ec8a1243103f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "db645c96fc8be04015e0eb538afec9ae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9e193b6b28ce798c44c744efde19eee9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ac8e6391200cec2abdebb00744a2ba82", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d34ec07845cd8523651e5f5112984a14", "745c794b557d4a0d734e45d720a7f7ad",
+ "f9813870fc27941a7c00a0443d7c2fe7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a9e9805769fe1baf5c7933793ccca0d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4ed1a6200912995d4f571bdb7822aa83", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "768f63912e43148c13688d7f23281531", "43fb786fd2e79610d6a6d912b95f4509",
+ "02880fde51ac991ad18d8986f4e5145c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9051290279237f9fb1389989b142d2dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "cb6238b8eb6b72980958e6fcceb2f2eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d34ec07845cd8523651e5f5112984a14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "946af3a8f5362def5f4e27cb0fd4e754", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "885c384d90aaa34acd8303958033c252", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "54b17120f7d71ddb4d70590ecd231cc1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2ce55308d873f4cd244f16da2b06e06e", "af7b76d3471cfbdc97d1e57bc2876ce7",
+ "20b14a6b5af7aa356963bcaaf23d230d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "553a2c24939dff18ec5833c77f556cfb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "92e31a45513582f386dc9c22a57bbbbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "768f63912e43148c13688d7f23281531", "4e255554dab9dfa1064e20a905538308",
+ "aa25073115bad49432953254e7dce0bc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "34cdc1be291c95981c98812c5c343a15", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "626321a6dfac542d0fc70321fac13ff3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2ce55308d873f4cd244f16da2b06e06e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7ad78dfe7bbedf696dd58d9ad01bcfba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "8110ed10e7234851dff3c7e4a51108a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f6e36446a97611a4db4425df926974b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a4bb5d5ff4b25f391265b5231049a09a", "cf4867c6b1b8be86a7e0bee708c28d83",
+ "9c9c41435697f75fa118b6d6464ee7cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5c1ec75a160c444fa90abf106fa1140e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6dbf310a9c8d85f76306d6a35545f8af", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2e7927158e7b8e40e7269fc909fb584b", "8b72feff8bb0901229a2bd7da2857c4b",
+ "69e3361b7199e10e75685b90fb0df623", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "5b64a6911cb7c3d60bb8f961ed9782a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "1c6fda7501e0f8bdad972f7857cd9354", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a4bb5d5ff4b25f391265b5231049a09a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "f0fd9c09d454e4ce918faa97e9ac10be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "6fb9383302eb7e7a13387464d2634e03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "a82f4080699300b659bbe1b5c4463147", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9106e0c820b03bcdde3aa94efc11a3e", "0408e10e51a31ac756a57d5149a2b409",
+ "38816245ed832ba313fefafcbed1e5c8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2266840f11ac4c066d941ec473b1a54f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "80fce29dc82d5857c1ed5ef2aea16835", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "123028e18c2bfb334e34adb5a4f67de4", "1670eb8ed876e609ed81236a683b4a3d",
+ "2f8ab35f6e7030e82ca922a68b29af4a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7133de9d03a4b07716a12226b5e493e8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4fd485dadcb570e5a0a5addaf9ba84da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c9106e0c820b03bcdde3aa94efc11a3e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "af6ae5c0eb28417bd251184baf2eaba7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "864d51fcc737bc73a3f588b67515039a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "ecedb178f7cad3dc1b921eca67f9efb6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7ec2eae9e118506da8b33440b399511a", "108a4a6530a6b9c933ccf14edbd896be",
+ "5d34137cc8ddba75347b0fa1d0a91791", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "9e194755b2a37b615a517d5f8746dfbb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "14f2c5b9d2cd621c178a39f1ec0c38eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "123028e18c2bfb334e34adb5a4f67de4", "2fdc713ba418780d0be33a3ebbcb323c",
+ "452f91b01833c57db4e909575a029ff6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "3594eff52d5ed875bd9655ddbf106fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d3f140aea9e8eabf4e1e5190e0148288", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "7ec2eae9e118506da8b33440b399511a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "866f8df540dd3b58ab1339314d139cbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2ecb7890f00234bcb28c1d969f489012", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "0609ca0ff3ca90069e8b48829b4b0891", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "78de867c8ee947ed6d29055747f26949", "0a7cb4f51f1acf0940b59295b2327465",
+ "465dcb046a0449b9dfb3e0b297aa3863", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "bbf86f8174334f0b8d869fd8d58bf92d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "da54cfb4530841bda29966cfa05f4879", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "2c979c2bddef79a760e72a802f83cc76", "545426be3436073ba63790aa3c4a5598",
+ "1fabf0655bedb671e4d7287fec8119ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "90d7e13aa2f9a064493ff2b3b5b12109", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "e4938219593bbed5ae638a93f2f4a580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "78de867c8ee947ed6d29055747f26949", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "72803589b453a29501540aeddc23e6f4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "c4793d431dbf2d88826bb440bf027512", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "839e86c681e97359f7819c766000dd1c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d05a237ed7a9ca877256b71555b1b8e4", "3052776d186fca6dd8011f4fe908a212",
+ "94b3e5bcd6b849b66a4571ec3d23f9be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "91d6bdbc62d4bb80c9b371d9704e3c9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "4f750f6375524311d260306deb233861", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d05a237ed7a9ca877256b71555b1b8e4", "03ce2d07cac044d6b68604d398571844",
+ "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "d05a237ed7a9ca877256b71555b1b8e4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "03ce2d07cac044d6b68604d398571844", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ };
+ return kDigest[id];
+}
+#endif
+
+struct ConvolveTestParam {
+ ConvolveTestParam(int width, int height) : width(width), height(height) {}
+ int width;
+ int height;
+};
+
+struct ConvolveTypeParam {
+ ConvolveTypeParam(bool is_intra_block_copy, bool is_compound,
+ bool has_vertical_filter, bool has_horizontal_filter)
+ : is_intra_block_copy(is_intra_block_copy),
+ is_compound(is_compound),
+ has_vertical_filter(has_vertical_filter),
+ has_horizontal_filter(has_horizontal_filter) {}
+ bool is_intra_block_copy;
+ bool is_compound;
+ bool has_vertical_filter;
+ bool has_horizontal_filter;
+};
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) {
+ return os << "is_intra_block_copy: " << param.is_intra_block_copy
+ << ", is_compound: " << param.is_compound
+ << ", has_(vertical/horizontal)_filter: "
+ << param.has_vertical_filter << "/" << param.has_horizontal_filter;
+}
+
+ // TODO(b/146062680): split this into ConvolveTest and ConvolveScaleTest to
+ // simplify the members and test logic.
+template <int bitdepth, typename Pixel>
+class ConvolveTest
+ : public testing::TestWithParam<
+ std::tuple<ConvolveTestParam, ConvolveTypeParam, bool>> {
+ public:
+ ConvolveTest() = default;
+ ~ConvolveTest() override = default;
+
+ void SetUp() override {
+ ConvolveInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ GetConvolveFuncs(dsp, &base_convolve_func_, &base_convolve_scale_func_);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_convolve_func_ = nullptr;
+ base_convolve_scale_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ ConvolveInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) != 0) {
+ ConvolveInit_AVX2();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ ConvolveInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ GetConvolveFuncs(dsp, &cur_convolve_func_, &cur_convolve_scale_func_);
+
+ // Skip functions that have not been specialized for this particular
+ // architecture.
+ if (cur_convolve_func_ == base_convolve_func_) {
+ cur_convolve_func_ = nullptr;
+ }
+ if (cur_convolve_scale_func_ == base_convolve_scale_func_) {
+ cur_convolve_scale_func_ = nullptr;
+ }
+ }
+
+ protected:
+ int GetDigestId() const {
+ // The id is an index into the flattened 3-dimensional array
+ // (param_, type_param_, is_scaled_convolve_).
+ // The dimension sizes are 20, 16 and 2, so the id ranges from 0 to
+ // 20x16x2 - 1.
+ // is_scaled_convolve_: false, id += 0; true, id += 1;
+ // type_param_: (0, 0, 0, 0), id += 0 * 2.
+ // (0, 0, 0, 1), id += 1 * 2; (0, 0, 1, 0), id += 2 * 2;
+ // ...
+ // param_: (2, 2), id += 0 * 32; (2, 4), id += 1 * 32;
+ // (4, 2), id += 2 * 32; (4, 4), id += 3 * 32;
+ // ...
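+ // For example (derived from the mapping above): param_ = (4, 4),
+ // type_param_ = (0, 0, 1, 0) and is_scaled_convolve_ = false yields
+ // id = 2 * 2 + 32 * 3 = 100.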
+ int id = static_cast<int>(is_scaled_convolve_);
+ id += 2 * static_cast<int>(type_param_.has_horizontal_filter);
+ id += 2 * 2 * static_cast<int>(type_param_.has_vertical_filter);
+ id += 2 * 4 * static_cast<int>(type_param_.is_compound);
+ id += 2 * 8 * static_cast<int>(type_param_.is_intra_block_copy);
+ if (param_.width == param_.height) {
+ id += 32 * 3 * static_cast<int>(std::log2(param_.width) - 1);
+ } else if (param_.width < param_.height) {
+ id += 32 * (1 + 3 * static_cast<int>(std::log2(param_.width) - 1));
+ } else {
+ // param_.width > param_.height
+ if (param_.width == 8 && param_.height == 2) {
+ // Special case is at the end of the array.
+ id += 32 * 19;
+ } else {
+ id += 32 * (2 + 3 * static_cast<int>(std::log2(param_.height) - 1));
+ }
+ }
+ return id;
+ }
+
+ void GetConvolveFuncs(const Dsp* dsp, ConvolveFunc* func,
+ ConvolveScaleFunc* scale_func);
+ void SetInputData(bool use_fixed_values, int value);
+ void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+ libvpx_test::MD5* md5_digest);
+ void Check16Bit(bool use_fixed_values, const uint16_t* src,
+ const uint16_t* dest, libvpx_test::MD5* md5_digest);
+ // |num_runs| covers the 6 filter categories and the 16 filters under each
+ // category.
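+ // (kMinimumViableRuns is expected to equal 6 * 16 = 96.)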
+ void Test(bool use_fixed_values, int value,
+ int num_runs = kMinimumViableRuns);
+
+ const ConvolveTestParam param_ = std::get<0>(GetParam());
+ const ConvolveTypeParam type_param_ = std::get<1>(GetParam());
+ const bool is_scaled_convolve_ = std::get<2>(GetParam());
+
+ private:
+ ConvolveFunc base_convolve_func_;
+ ConvolveFunc cur_convolve_func_;
+ ConvolveScaleFunc base_convolve_scale_func_;
+ ConvolveScaleFunc cur_convolve_scale_func_;
+ // Convolve filters are 7-tap, which need 3 pixels (kRestorationBorder) of
+ // padding.
+ // When is_scaled_convolve_ is true, the source can be at most 2 times the
+ // max width/height, so we allocate a larger buffer and set up the extra
+ // memory only in that case.
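+ // Hence the source buffers below are sized for 4x the maximum block area.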
+ Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+ uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+ uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+ Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+ const int source_stride_ =
+ is_scaled_convolve_ ? kMaxBlockWidth * 2 : kMaxBlockWidth;
+ const int source_height_ =
+ is_scaled_convolve_ ? kMaxBlockHeight * 2 : kMaxBlockHeight;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::GetConvolveFuncs(
+ const Dsp* const dsp, ConvolveFunc* func, ConvolveScaleFunc* scale_func) {
+ if (is_scaled_convolve_) {
+ *func = nullptr;
+ *scale_func = dsp->convolve_scale[type_param_.is_compound];
+ } else {
+ *scale_func = nullptr;
+ *func =
+ dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound]
+ [type_param_.has_vertical_filter]
+ [type_param_.has_horizontal_filter];
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+ int value) {
+ if (use_fixed_values) {
+ std::fill(source_, source_ + source_height_ * source_stride_, value);
+ } else {
+ const int offset =
+ kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int height = is_scaled_convolve_ ? param_.height * 2 : param_.height;
+ const int width = is_scaled_convolve_ ? param_.width * 2 : param_.width;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+ }
+ }
+ // Copy border pixels to the left and right borders.
+ for (int y = 0; y < height; ++y) {
+ Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+ source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+ Memset(&source_[y * source_stride_ + offset + width],
+ source_[y * source_stride_ + offset + width - 1],
+ kConvolveBorderLeftTop);
+ }
+ // Copy border pixels to the top and bottom borders.
+ for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+ memcpy(&source_[y * source_stride_],
+ &source_[kConvolveBorderLeftTop * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+ &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+ source_stride_ * sizeof(Pixel));
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+ const Pixel* src, const Pixel* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values,
+ const uint16_t* src,
+ const uint16_t* dest,
+ libvpx_test::MD5* md5_digest) {
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ const bool success =
+ test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+ kMaxBlockWidth, kMaxBlockWidth, false);
+ EXPECT_TRUE(success);
+ } else {
+ // For random input, compare md5.
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+ md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Test(bool use_fixed_values, int value,
+ int num_runs /*= 16 * 6*/) {
+ // Testing fixed input is not meaningful for compound convolve.
+ if (type_param_.is_compound && use_fixed_values) GTEST_SKIP();
+
+ // Scaled convolve does not behave differently under most type parameters;
+ // only the compound flag selects a different implementation, so skip the
+ // other combinations.
+ if (is_scaled_convolve_ &&
+ (type_param_.is_intra_block_copy || type_param_.has_vertical_filter ||
+ type_param_.has_horizontal_filter)) {
+ GTEST_SKIP();
+ }
+
+ // There should not be any function set for this combination.
+ if (type_param_.is_intra_block_copy && type_param_.is_compound) {
+ ASSERT_EQ(cur_convolve_func_, nullptr);
+ return;
+ }
+
+ // Compound and intra block copy functions are only used for blocks 4x4 or
+ // greater.
+ if (type_param_.is_compound || type_param_.is_intra_block_copy) {
+ if (param_.width < 4 || param_.height < 4) {
+ GTEST_SKIP();
+ }
+ }
+
+ // Skip unspecialized functions.
+ if (cur_convolve_func_ == nullptr && cur_convolve_scale_func_ == nullptr) {
+ GTEST_SKIP();
+ }
+
+ SetInputData(use_fixed_values, value);
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ GetDigestId());
+ // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x.
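+ // (A |step| of 1024 corresponds to an unscaled, 1:1 ratio.)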
+ const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+ const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+ int subpixel_x = 0;
+ int subpixel_y = 0;
+ int vertical_index = 0;
+ int horizontal_index = 0;
+ const int offset =
+ kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+ const int offset_scale =
+ kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+ const Pixel* const src = source_ + offset;
+ const Pixel* const src_scale = source_ + offset_scale;
+ const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+ const ptrdiff_t src_stride_16 = source_stride_;
+ const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+ // Pack Compound output since we control the predictor buffer.
+ const ptrdiff_t dst_stride_compound = param_.width;
+
+ // Output is always 16 bits regardless of |bitdepth|.
+ uint16_t* dst_16 = dest_16bit_ + offset;
+ // Output depends on |bitdepth|.
+ Pixel* dst_pixel = dest_clipped_ + offset;
+
+ // Collect the first |kMinimumViableRuns| runs into one md5 buffer.
+ libvpx_test::MD5 md5_digest;
+
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ // Test every filter.
+ // Because of masking, |subpixel_{x,y}| values roll over every 16 iterations.
+ subpixel_x += 1 << 6;
+ subpixel_y += 1 << 6;
+
+ const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+ const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
+
+ // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+ // function.
+ if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+ continue;
+ }
+
+ // For focused speed testing these can be set to the desired filter. Want
+ // only 8 tap filters? Set |{vertical,horizontal}_index| to 2.
+ vertical_index += static_cast<int>(i % 16 == 0);
+ vertical_index %= 4;
+ horizontal_index += static_cast<int>(i % 16 == 0);
+ horizontal_index %= 4;
+
+ if (is_scaled_convolve_) {
+ ASSERT_EQ(cur_convolve_func_, nullptr);
+ // Output type is uint16_t.
+ const absl::Time start = absl::Now();
+ if (type_param_.is_compound) {
+ cur_convolve_scale_func_(
+ source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+ step_y, param_.width, param_.height, dst_16, dst_stride_compound);
+ } else {
+ cur_convolve_scale_func_(
+ source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+ step_y, param_.width, param_.height, dst_pixel, dst_stride);
+ }
+ elapsed_time += absl::Now() - start;
+ } else if (type_param_.is_compound) {
+ ASSERT_EQ(cur_convolve_scale_func_, nullptr);
+ // Output type is uint16_t.
+ const absl::Time start = absl::Now();
+ cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+ horizontal_filter_id, vertical_filter_id, param_.width,
+ param_.height, dst_16, dst_stride_compound);
+ elapsed_time += absl::Now() - start;
+ } else {
+ ASSERT_EQ(cur_convolve_scale_func_, nullptr);
+ // Output type is Pixel.
+ const absl::Time start = absl::Now();
+ cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+ horizontal_filter_id, vertical_filter_id, param_.width,
+ param_.height, dst_pixel, dst_stride);
+ elapsed_time += absl::Now() - start;
+ }
+
+ // Only check the output for the first set. After that it's just repeated
+ // runs for speed timing.
+ if (i >= kMinimumViableRuns) continue;
+
+ if (is_scaled_convolve_) {
+ // The convolve function does not clip its output; clipping is applied
+ // later in the pipeline. libaom does clip the output, so we apply
+ // clipping here to match libaom in the tests.
+ if (type_param_.is_compound) {
+ const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1));
+ Pixel* dest_row = dest_clipped_;
+ for (int y = 0; y < kMaxBlockHeight; ++y) {
+ for (int x = 0; x < kMaxBlockWidth; ++x) {
+ dest_row[x] = static_cast<Pixel>(Clip3(
+ dest_16bit_[y * dst_stride_compound + x] - single_round_offset,
+ 0, (1 << bitdepth) - 1));
+ }
+ dest_row += kMaxBlockWidth;
+ }
+ }
+
+ if (type_param_.is_compound) {
+ Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16,
+ &md5_digest);
+ } else {
+ Check(use_fixed_values, src_scale, dst_pixel, &md5_digest);
+ }
+ } else if (type_param_.is_compound) {
+ // Need to copy source to a uint16_t buffer for comparison.
+ Pixel* src_ptr = source_;
+ uint16_t* src_ptr_16 = source_16bit_;
+ for (int y = 0; y < kMaxBlockHeight; ++y) {
+ for (int x = 0; x < kMaxBlockWidth; ++x) {
+ src_ptr_16[x] = src_ptr[x];
+ }
+ src_ptr += src_stride_16;
+ src_ptr_16 += src_stride_16;
+ }
+
+ Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16, &md5_digest);
+ } else {
+ Check(use_fixed_values, src, dst_pixel, &md5_digest);
+ }
+ }
+
+ if (!use_fixed_values) {
+ // md5 sums are only calculated for random input.
+ const char* ref_digest;
+ if (bitdepth == 8) {
+ ref_digest = GetDigest8bpp(GetDigestId());
+ } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ ref_digest = GetDigest10bpp(GetDigestId());
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+ const char* direction;
+ if (is_scaled_convolve_ || (type_param_.has_vertical_filter &&
+ type_param_.has_horizontal_filter)) {
+ direction = "2D";
+ } else if (type_param_.has_vertical_filter) {
+ direction = "Vertical";
+ } else if (type_param_.has_horizontal_filter) {
+ direction = "Horizontal";
+ } else {
+ direction = "Copy";
+ }
+ const auto elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+ printf("Mode Convolve%s%s%s%s[%25s]: %5d us MD5: %s\n",
+ type_param_.is_compound ? "Compound" : "",
+ type_param_.is_intra_block_copy ? "IntraBlockCopy" : "",
+ is_scaled_convolve_ ? "Scale" : "", direction,
+ absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+ elapsed_time_us, md5_digest.Get());
+ EXPECT_STREQ(ref_digest, md5_digest.Get());
+ }
+}
+
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ int min = 0, max = 0;
+ for (int i = 0; i < kSubPixelTaps; ++i) {
+ const int tap = filter[i];
+ if (tap > 0) {
+ max += max_input * tap;
+ min += min_input * tap;
+ } else {
+ min += max_input * tap;
+ max += min_input * tap;
+ }
+ }
+ *min_output = min;
+ *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Convolve process.
+template <int bitdepth>
+void ShowRange() {
+ // Subtract one from the shift bits because the filter is pre-shifted by 1.
+ constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsHorizontal12bpp - 1
+ : kInterRoundBitsHorizontal - 1;
+ constexpr int vertical_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsVertical12bpp - 1
+ : kInterRoundBitsVertical - 1;
+ constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1;
+
+ constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+ constexpr int max_input = (1 << bitdepth) - 1;
+
+ const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8];
+
+ // First pass.
+ printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0,
+ max_input);
+
+ int min, max;
+ ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max);
+
+ if (bitdepth == 8) {
+ // 8bpp can use int16_t for sums.
+ assert(min > INT16_MIN);
+ assert(max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+ const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+ // All bitdepths can use int16_t for first pass output.
+ assert(first_pass_min > INT16_MIN);
+ assert(first_pass_max < INT16_MAX);
+
+ printf(" first pass output range: [%8d, %8d]\n", first_pass_min,
+ first_pass_max);
+
+ // Second pass.
+ ApplyFilterToSignedInput(first_pass_min, first_pass_max,
+ worst_convolve_filter, &min, &max);
+
+ // All bitdepths require int32_t for second pass sums.
+ assert(min < INT16_MIN && min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ // Second pass non-compound output is clipped to Pixel values.
+ const int second_pass_min =
+ Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+ const int second_pass_max =
+ Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+ printf(" second pass output range: [%8d, %8d]\n", second_pass_min,
+ second_pass_max);
+
+ // Output is Pixel so matches Pixel values.
+ assert(second_pass_min == 0);
+ assert(second_pass_max == max_input);
+
+ const int compound_second_pass_min =
+ RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+ const int compound_second_pass_max =
+ RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+ printf(" compound second pass output range: [%8d, %8d]\n",
+ compound_second_pass_min, compound_second_pass_max);
+
+ if (bitdepth == 8) {
+ // 8bpp output is int16_t without an offset.
+ assert(compound_second_pass_min > INT16_MIN);
+ assert(compound_second_pass_max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp use the offset to fit inside uint16_t.
+ assert(compound_second_pass_min > 0);
+ assert(compound_second_pass_max < UINT16_MAX);
+ }
+
+ printf("\n");
+}
+
+TEST(ConvolveTest, ShowRange) {
+ ShowRange<kBitdepth8>();
+ ShowRange<kBitdepth10>();
+ ShowRange<kBitdepth12>();
+}
+
+using ConvolveTest8bpp = ConvolveTest<8, uint8_t>;
+
+TEST_P(ConvolveTest8bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, 255);
+}
+
+TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest8bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+const ConvolveTestParam kConvolveParam[] = {
+ ConvolveTestParam(2, 2), ConvolveTestParam(2, 4),
+ ConvolveTestParam(4, 2), ConvolveTestParam(4, 4),
+ ConvolveTestParam(4, 8), ConvolveTestParam(8, 2),
+ ConvolveTestParam(8, 4), ConvolveTestParam(8, 8),
+ ConvolveTestParam(8, 16), ConvolveTestParam(16, 8),
+ ConvolveTestParam(16, 16), ConvolveTestParam(16, 32),
+ ConvolveTestParam(32, 16), ConvolveTestParam(32, 32),
+ ConvolveTestParam(32, 64), ConvolveTestParam(64, 32),
+ ConvolveTestParam(64, 64), ConvolveTestParam(64, 128),
+ ConvolveTestParam(128, 64), ConvolveTestParam(128, 128),
+};
+
+const ConvolveTypeParam kConvolveTypeParam[] = {
+ ConvolveTypeParam(false, false, false, false),
+ ConvolveTypeParam(false, false, false, true),
+ ConvolveTypeParam(false, false, true, false),
+ ConvolveTypeParam(false, false, true, true),
+ ConvolveTypeParam(false, true, false, false),
+ ConvolveTypeParam(false, true, false, true),
+ ConvolveTypeParam(false, true, true, false),
+ ConvolveTypeParam(false, true, true, true),
+ ConvolveTypeParam(true, false, false, false),
+ ConvolveTypeParam(true, false, false, true),
+ ConvolveTypeParam(true, false, true, false),
+ ConvolveTypeParam(true, false, true, true),
+ ConvolveTypeParam(true, true, false, false),
+ ConvolveTypeParam(true, true, false, true),
+ ConvolveTypeParam(true, true, true, false),
+ ConvolveTypeParam(true, true, true, true),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveParam),
+ testing::ValuesIn(kConvolveTypeParam),
+ testing::Bool()));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveParam),
+ testing::ValuesIn(kConvolveTypeParam),
+ testing::Bool()));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveParam),
+ testing::ValuesIn(kConvolveTypeParam),
+ testing::Bool()));
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp,
+ testing::Combine(testing::ValuesIn(kConvolveParam),
+ testing::ValuesIn(kConvolveTypeParam),
+ testing::Bool()));
+#endif // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConvolveTest10bpp = ConvolveTest<10, uint16_t>;
+
+TEST_P(ConvolveTest10bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest10bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp,
+ testing::Combine(testing::ValuesIn(kConvolveParam),
+ testing::ValuesIn(kConvolveTypeParam),
+ testing::Bool()));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
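The ApplyFilterToSignedInput()/ShowRange() helpers above bound the convolve accumulators with simple interval arithmetic over the filter taps. The standalone sketch below illustrates the same idea with a hypothetical 8-tap filter (it is not one of libgav1's kHalfSubPixelFilters entries); the exact bounds depend on the taps actually used.

// Minimal sketch of the tap-by-tap interval arithmetic used above. The filter
// below is illustrative only.
#include <cstdint>
#include <cstdio>

constexpr int kTaps = 8;

void FilterRange(int min_input, int max_input, const int8_t taps[kTaps],
                 int* min_output, int* max_output) {
  int min = 0, max = 0;
  for (int i = 0; i < kTaps; ++i) {
    // A positive tap pushes the sum toward max_input, a negative tap toward
    // min_input, so each extreme can be accumulated independently.
    if (taps[i] > 0) {
      max += max_input * taps[i];
      min += min_input * taps[i];
    } else {
      max += min_input * taps[i];
      min += max_input * taps[i];
    }
  }
  *min_output = min;
  *max_output = max;
}

int main() {
  // Hypothetical half-pel style filter whose taps sum to 64.
  const int8_t taps[kTaps] = {-1, 3, -6, 36, 36, -6, 3, -1};
  int min, max;
  FilterRange(/*min_input=*/0, /*max_input=*/255, taps, &min, &max);
  // Prints [-3570, 19890]: well within int16_t, which is why the 8bpp first
  // pass can accumulate in 16 bits.
  printf("first pass range: [%d, %d]\n", min, max);
  return 0;
}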
diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc
new file mode 100644
index 0000000..b3f3a2e
--- /dev/null
+++ b/src/dsp/distance_weighted_blend_test.cc
@@ -0,0 +1,324 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 500000;
+
+constexpr int kQuantizedDistanceLookup[4][2] = {
+ {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+struct TestParam {
+ TestParam(int width, int height) : width(width), height(height) {}
+ int width;
+ int height;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+template <int bitdepth, typename Pixel>
+class DistanceWeightedBlendTest : public testing::TestWithParam<TestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ DistanceWeightedBlendTest() = default;
+ ~DistanceWeightedBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ DistanceWeightedBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_func_ = dsp->distance_weighted_blend;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_func_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ DistanceWeightedBlendInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ DistanceWeightedBlendInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->distance_weighted_blend;
+ }
+
+ protected:
+ void Test(const char* digest, int num_tests);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+ {};
+ dsp::DistanceWeightedBlendFunc base_func_;
+ dsp::DistanceWeightedBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest,
+ int num_tests) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ PredType* src_1 = source1_;
+ PredType* src_2 = source2_;
+
+ const int index = rnd.Rand8() & 3;
+ const uint8_t weight_0 = kQuantizedDistanceLookup[index][0];
+ const uint8_t weight_1 = kQuantizedDistanceLookup[index][1];
+ // In libgav1, predictors have an offset which is later subtracted and
+ // clipped in distance weighted blending. Therefore, we add the offset
+ // here to match libaom's implementation.
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ // distance_weighted_blend is applied to compound prediction values. This
+ // implies a range far exceeding that of pixel values.
+ // The ranges include kCompoundOffset in 10bpp and 12bpp.
+ // See src/dsp/convolve.cc and src/dsp/warp.cc.
+ static constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+ };
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ src_1 += width_;
+ src_2 += width_;
+ }
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_tests; ++i) {
+ const absl::Time start = absl::Now();
+ func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_,
+ sizeof(Pixel) * kDestStride);
+ elapsed_time += absl::Now() - start;
+ }
+
+ test_utils::CheckMd5Digest(
+ "DistanceWeightedBlend",
+ absl::StrFormat("BlockSize%dx%d", width_, height_).c_str(), digest, dest_,
+ sizeof(dest_), elapsed_time);
+}
+
+const TestParam kTestParam[] = {
+ TestParam(4, 4), TestParam(4, 8), TestParam(4, 16),
+ TestParam(8, 4), TestParam(8, 8), TestParam(8, 16),
+ TestParam(8, 32), TestParam(16, 4), TestParam(16, 8),
+ TestParam(16, 16), TestParam(16, 32), TestParam(16, 64),
+ TestParam(32, 8), TestParam(32, 16), TestParam(32, 32),
+ TestParam(32, 64), TestParam(32, 128), TestParam(64, 16),
+ TestParam(64, 32), TestParam(64, 64), TestParam(64, 128),
+ TestParam(128, 32), TestParam(128, 64), TestParam(128, 128),
+};
+
+const char* GetDistanceWeightedBlendDigest8bpp(const TestParam block_size) {
+ static const char* const kDigestsWidth4[] = {
+ "ebf389f724f8ab46a2cac895e4e073ca",
+ "09acd567b6b12c8cf8eb51d8b86eb4bf",
+ "57bb4d65695d8ec6752f2bd8686b64fd",
+ };
+ static const char* const kDigestsWidth8[] = {
+ "270905ac76f9a2cba8a552eb0bf7c8c1",
+ "f0801c8574d2c271ef2bbea77a1d7352",
+ "e761b580e3312be33a227492a233ce72",
+ "ff214dab1a7e98e2285961d6421720c6",
+ };
+ static const char* const kDigestsWidth16[] = {
+ "4f712609a36e817f9752326d58562ff8", "14243f5c5f7c7104160c1f2cef0a0fbc",
+ "3ac3f3161b7c8dd8436b02abfdde104a", "81a00b704e0e41a5dbe6436ac70c098d",
+ "af8fd02017c7acdff788be742d700baa",
+ };
+ static const char* const kDigestsWidth32[] = {
+ "ee34332c66a6d6ed8ce64031aafe776c", "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
+ "607ffc22098d81b7e37a7bf62f4af5d3", "3823dbf043b4682f56d5ca698e755ea5",
+ "57f7e8d1e67645269ce760a2c8da4afc",
+ };
+ static const char* const kDigestsWidth64[] = {
+ "4acf556b921956c2bc24659cd5128401",
+ "a298c544c9c3b27924b4c23cc687ea5a",
+ "539e2df267782ce61c70103b23b7d922",
+ "3b0cb2a0b5d384efee4d81401025bec1",
+ };
+ static const char* const kDigestsWidth128[] = {
+ "d71ee689a40ff5f390d07717df4b7233",
+ "8b56b636dd712c2f8d138badb7219991",
+ "8cfc8836908902b8f915639b7bff45b3",
+ };
+ const int height_index =
+ FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2;
+ switch (block_size.width) {
+ case 4:
+ return kDigestsWidth4[height_index - 2];
+ case 8:
+ return kDigestsWidth8[height_index - 1];
+ case 16:
+ return kDigestsWidth16[height_index];
+ case 32:
+ return kDigestsWidth32[height_index];
+ case 64:
+ return kDigestsWidth64[height_index];
+ default:
+ EXPECT_EQ(block_size.width, 128)
+ << "Unknown width parameter: " << block_size.width;
+ return kDigestsWidth128[height_index];
+ }
+}
+
+using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>;
+
+TEST_P(DistanceWeightedBlendTest8bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDistanceWeightedBlendDigest10bpp(const TestParam block_size) {
+ static const char* const kDigestsWidth4[] = {
+ "55f594b56e16d5c401274affebbcc3d3",
+ "69df14da4bb33a8f7d7087921008e919",
+ "1b61f33604c54015794198a13bfebf46",
+ };
+ static const char* const kDigestsWidth8[] = {
+ "825a938185b152f7cf09bf1c0723ce2b",
+ "85ea315c51d979bc9b45834d6b40ec6f",
+ "92ebde208e8c39f7ec6de2de82182dbb",
+ "520f84716db5b43684dbb703806383fe",
+ };
+ static const char* const kDigestsWidth16[] = {
+ "12ca23e3e2930005a0511646e8c83da4", "6208694a6744f4a3906f58c1add670e3",
+ "a33d63889df989a3bbf84ff236614267", "34830846ecb0572a98bbd192fed02b16",
+ "34bb2f79c0bd7f9a80691b8af597f2a8",
+ };
+ static const char* const kDigestsWidth32[] = {
+ "fa97f2d0e3143f1f44d3ac018b0d696d", "3df4a22456c9ab6ed346ab1b9750ae7d",
+ "6276a058b35c6131bc0c94a4b4a37ebc", "9ca42da5d2d5eb339df03ae2c7a26914",
+ "2ff0dc010a7b40830fb47423a9beb894",
+ };
+ static const char* const kDigestsWidth64[] = {
+ "800e692c520f99223bc24c1ac95a0166",
+ "818b6d20426585ef7fe844015a03aaf5",
+ "fb48691ccfff083e01d74826e88e613f",
+ "0bd350bc5bc604a224d77a5f5a422698",
+ };
+ static const char* const kDigestsWidth128[] = {
+ "02aac5d5669c1245da876c5440c4d829",
+ "a130840813cd6bd69d09bcf5f8d0180f",
+ "6ece1846bea55e8f8f2ed7fbf73718de",
+ };
+ const int height_index =
+ FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2;
+ switch (block_size.width) {
+ case 4:
+ return kDigestsWidth4[height_index - 2];
+ case 8:
+ return kDigestsWidth8[height_index - 1];
+ case 16:
+ return kDigestsWidth16[height_index];
+ case 32:
+ return kDigestsWidth32[height_index];
+ case 64:
+ return kDigestsWidth64[height_index];
+ default:
+ EXPECT_EQ(block_size.width, 128)
+ << "Unknown width parameter: " << block_size.width;
+ return kDigestsWidth128[height_index];
+ }
+}
+
+using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest10bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp,
+ testing::ValuesIn(kTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
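The digest lookups above index their per-width tables with FloorLog2(height) - FloorLog2(width) + 2 plus a per-width adjustment. The small sketch below (not part of the test) walks the kTestParam block sizes and shows that, for each width, the tested heights map onto consecutive indices starting at 0.

// Sanity sketch of the height_index mapping used by
// GetDistanceWeightedBlendDigest8bpp()/10bpp(). Illustrative only.
#include <cstdio>

int FloorLog2Int(int n) {  // Assumed equivalent to libgav1's FloorLog2 for n > 0.
  int log = 0;
  while (n > 1) {
    n >>= 1;
    ++log;
  }
  return log;
}

int main() {
  const int kWidths[] = {4, 8, 16, 32, 64, 128};
  const int kOffsets[] = {2, 1, 0, 0, 0, 0};  // Subtracted inside the switch.
  for (int i = 0; i < 6; ++i) {
    const int width = kWidths[i];
    const int min_height = (width / 4 > 4) ? width / 4 : 4;
    const int max_height = (width * 4 < 128) ? width * 4 : 128;
    for (int height = min_height; height <= max_height; height <<= 1) {
      const int height_index = FloorLog2Int(height) - FloorLog2Int(width) + 2;
      // Each width's smallest tested height lands on index 0.
      printf("%3dx%-3d -> kDigestsWidth%d[%d]\n", width, height, width,
             height_index - kOffsets[i]);
    }
  }
  return 0;
}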
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
index 5b54c4e..a3d7701 100644
--- a/src/dsp/dsp.cc
+++ b/src/dsp/dsp.cc
@@ -16,7 +16,6 @@
#include <mutex> // NOLINT (unapproved c++11 header)
-#include "src/dsp/arm/weight_mask_neon.h"
#include "src/dsp/average_blend.h"
#include "src/dsp/cdef.h"
#include "src/dsp/convolve.h"
@@ -24,6 +23,10 @@
#include "src/dsp/film_grain.h"
#include "src/dsp/intra_edge.h"
#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/dsp/inverse_transform.h"
#include "src/dsp/loop_filter.h"
#include "src/dsp/loop_restoration.h"
@@ -39,6 +42,30 @@
namespace libgav1 {
namespace dsp_internal {
+void DspInit_C() {
+ dsp::AverageBlendInit_C();
+ dsp::CdefInit_C();
+ dsp::ConvolveInit_C();
+ dsp::DistanceWeightedBlendInit_C();
+ dsp::FilmGrainInit_C();
+ dsp::IntraEdgeInit_C();
+ dsp::IntraPredCflInit_C();
+ dsp::IntraPredDirectionalInit_C();
+ dsp::IntraPredFilterInit_C();
+ dsp::IntraPredInit_C();
+ dsp::IntraPredSmoothInit_C();
+ dsp::InverseTransformInit_C();
+ dsp::LoopFilterInit_C();
+ dsp::LoopRestorationInit_C();
+ dsp::MaskBlendInit_C();
+ dsp::MotionFieldProjectionInit_C();
+ dsp::MotionVectorSearchInit_C();
+ dsp::ObmcInit_C();
+ dsp::SuperResInit_C();
+ dsp::WarpInit_C();
+ dsp::WeightMaskInit_C();
+}
+
dsp::Dsp* GetWritableDspTable(int bitdepth) {
switch (bitdepth) {
case 8: {
@@ -62,23 +89,7 @@ namespace dsp {
void DspInit() {
static std::once_flag once;
std::call_once(once, []() {
- AverageBlendInit_C();
- CdefInit_C();
- ConvolveInit_C();
- DistanceWeightedBlendInit_C();
- FilmGrainInit_C();
- IntraEdgeInit_C();
- IntraPredInit_C();
- InverseTransformInit_C();
- LoopFilterInit_C();
- LoopRestorationInit_C();
- MaskBlendInit_C();
- MotionFieldProjectionInit_C();
- MotionVectorSearchInit_C();
- ObmcInit_C();
- SuperResInit_C();
- WarpInit_C();
- WeightMaskInit_C();
+ dsp_internal::DspInit_C();
#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
const uint32_t cpu_features = GetCpuInfo();
#if LIBGAV1_ENABLE_SSE4_1
@@ -87,7 +98,11 @@ void DspInit() {
CdefInit_SSE4_1();
ConvolveInit_SSE4_1();
DistanceWeightedBlendInit_SSE4_1();
+ FilmGrainInit_SSE4_1();
IntraEdgeInit_SSE4_1();
+ IntraPredCflInit_SSE4_1();
+ IntraPredDirectionalInit_SSE4_1();
+ IntraPredFilterInit_SSE4_1();
IntraPredInit_SSE4_1();
IntraPredCflInit_SSE4_1();
IntraPredSmoothInit_SSE4_1();
@@ -108,6 +123,7 @@ void DspInit() {
#endif // LIBGAV1_ENABLE_SSE4_1
#if LIBGAV1_ENABLE_AVX2
if ((cpu_features & kAVX2) != 0) {
+ CdefInit_AVX2();
ConvolveInit_AVX2();
LoopRestorationInit_AVX2();
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -125,7 +141,7 @@ void DspInit() {
IntraEdgeInit_NEON();
IntraPredCflInit_NEON();
IntraPredDirectionalInit_NEON();
- IntraPredFilterIntraInit_NEON();
+ IntraPredFilterInit_NEON();
IntraPredInit_NEON();
IntraPredSmoothInit_NEON();
InverseTransformInit_NEON();
@@ -138,6 +154,9 @@ void DspInit() {
SuperResInit_NEON();
WarpInit_NEON();
WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ InverseTransformInit10bpp_NEON();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
#endif // LIBGAV1_ENABLE_NEON
});
}
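The DspInit() wrapper above relies on std::call_once so that concurrent or repeated calls run the initialization lambda exactly once. A minimal standalone illustration of that pattern, unrelated to any libgav1 internals:

// Minimal illustration of the std::call_once pattern used by DspInit().
#include <cstdio>
#include <mutex>  // NOLINT (unapproved c++11 header)

void InitOnce() {
  static std::once_flag once;
  std::call_once(once, []() { printf("initialized\n"); });
}

int main() {
  InitOnce();
  InitOnce();  // The lambda does not run a second time.
  return 0;
}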
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index fcbac3a..153db7f 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -17,7 +17,7 @@
#ifndef LIBGAV1_SRC_DSP_DSP_H_
#define LIBGAV1_SRC_DSP_DSP_H_
-#include <cstddef> // ptrdiff_t
+#include <cstddef>
#include <cstdint>
#include <cstdlib>
@@ -372,8 +372,9 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width,
// |coefficients| is the upscale filter used by each pixel in a row. It is not
// used by the C function.
// |source| is the input frame buffer. It will be line extended.
+// |source_stride| is given in pixels.
// |dest| is the output buffer.
-// |stride| is given in pixels, and shared by |source| and |dest|.
+// |dest_stride| is given in pixels.
// |height| is the height of the block to be processed.
// |downscaled_width| is the width of the input frame.
// |upscaled_width| is the width of the output frame.
@@ -381,9 +382,10 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width,
// pixel.
// |initial_subpixel_x| is a base offset from which |step| increments.
using SuperResFunc = void (*)(const void* coefficients, void* source,
- ptrdiff_t stride, int height,
+ ptrdiff_t source_stride, int height,
int downscaled_width, int upscaled_width,
- int initial_subpixel_x, int step, void* dest);
+ int initial_subpixel_x, int step, void* dest,
+ ptrdiff_t dest_stride);
// Loop restoration function signature. Sections 7.16, 7.17.
// |restoration_info| contains loop restoration information, such as filter
@@ -391,14 +393,15 @@ using SuperResFunc = void (*)(const void* coefficients, void* source,
// |source| is the input frame buffer, which is deblocked and cdef filtered.
// |top_border| and |bottom_border| are the top and bottom borders.
// |dest| is the output.
-// |stride| is given in pixels, and shared by |source|, |top_border|,
-// |bottom_border| and |dest|.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |top_border_stride| and |bottom_border_stride| are given in pixels.
// |restoration_buffer| contains buffers required for self guided filter and
// wiener filter. They must be initialized before calling.
using LoopRestorationFunc = void (*)(
const RestorationUnitInfo& restoration_info, const void* source,
- const void* top_border, const void* bottom_border, ptrdiff_t stride,
- int width, int height, RestorationBuffer* restoration_buffer, void* dest);
+ ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
+ const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
+ int height, RestorationBuffer* restoration_buffer, void* dest);
// Index 0 is Wiener Filter.
// Index 1 is Self Guided Restoration Filter.
@@ -900,6 +903,11 @@ namespace dsp_internal {
(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only; it is not thread-safe.
+void DspInit_C();
+
// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
// exist. This version is meant for use by test or dsp/*Init() functions only.
dsp::Dsp* GetWritableDspTable(int bitdepth);
diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc
new file mode 100644
index 0000000..bf7b9f3
--- /dev/null
+++ b/src/dsp/dsp_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#include "tests/utils.h"
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Maps 1D transform to the maximum valid size for the corresponding transform.
+constexpr int kMax1DTransformSize[kNum1DTransforms] = {
+ k1DTransformSize64, // Dct.
+ k1DTransformSize16, // Adst.
+ k1DTransformSize32, // Identity.
+ k1DTransformSize4, // Wht.
+};
+
+void CheckTables(bool c_only) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10};
+#else
+ static constexpr int kBitdepths[] = {kBitdepth8};
+#endif
+
+ for (const auto& bitdepth : kBitdepths) {
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth));
+ for (int i = 0; i < kNumTransformSizes; ++i) {
+ for (int j = 0; j < kNumIntraPredictors; ++j) {
+ EXPECT_NE(dsp->intra_predictors[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr);
+ EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr);
+ EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr);
+ EXPECT_NE(dsp->filter_intra_predictor, nullptr);
+ for (int i = 0; i < kNumTransformSizes; ++i) {
+ if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) {
+ EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr)
+ << "index [" << i << "]";
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ } else {
+ EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr)
+ << "index [" << i << "]";
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ }
+ EXPECT_NE(dsp->intra_edge_filter, nullptr);
+ EXPECT_NE(dsp->intra_edge_upsampler, nullptr);
+ for (int i = 0; i < kNum1DTransforms; ++i) {
+ for (int j = 0; j < kNum1DTransformSizes; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ if (j <= kMax1DTransformSize[i]) {
+ EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr)
+ << "index [" << i << "][" << j << "][" << k << "]";
+ } else {
+ EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr)
+ << "index [" << i << "][" << j << "][" << k << "]";
+ }
+ }
+ }
+ }
+ for (int i = 0; i < kNumLoopFilterSizes; ++i) {
+ for (int j = 0; j < kNumLoopFilterTypes; ++j) {
+ EXPECT_NE(dsp->loop_filters[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ for (int i = 0; i < 2; ++i) {
+ EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]";
+ }
+
+ bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON;
+#if LIBGAV1_ENABLE_SSE4_1
+ const uint32_t cpu_features = GetCpuInfo();
+ super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0;
+#endif
+ if (c_only) super_res_coefficients_is_nonnull = false;
+ if (super_res_coefficients_is_nonnull) {
+ EXPECT_NE(dsp->super_res_coefficients, nullptr);
+ } else {
+ EXPECT_EQ(dsp->super_res_coefficients, nullptr);
+ }
+
+ EXPECT_NE(dsp->super_res, nullptr);
+ EXPECT_NE(dsp->cdef_direction, nullptr);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ EXPECT_NE(dsp->cdef_filters[i][j], nullptr)
+ << "index [" << i << "][" << j << "]";
+ }
+ }
+ for (auto convolve_func : dsp->convolve_scale) {
+ EXPECT_NE(convolve_func, nullptr);
+ }
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ for (int l = 0; l < 2; ++l) {
+ for (int m = 0; m < 2; ++m) {
+ if (j == 1 && k == 1) {
+ EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr);
+ } else {
+ EXPECT_NE(dsp->convolve[j][k][l][m], nullptr);
+ }
+ }
+ }
+ }
+ }
+ for (const auto& m : dsp->mask_blend) {
+ for (int i = 0; i < 2; ++i) {
+ if (i == 0 || bitdepth >= 10) {
+ EXPECT_NE(m[i], nullptr);
+ } else {
+ EXPECT_EQ(m[i], nullptr);
+ }
+ }
+ }
+ for (const auto& m : dsp->inter_intra_mask_blend_8bpp) {
+ if (bitdepth == 8) {
+ EXPECT_NE(m, nullptr);
+ } else {
+ EXPECT_EQ(m, nullptr);
+ }
+ }
+ for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) {
+ const int width_index = k4x4WidthLog2[i] - 1;
+ const int height_index = k4x4HeightLog2[i] - 1;
+ // Only block sizes >= 8x8 are handled with this function.
+ if (width_index < 0 || height_index < 0) continue;
+
+ for (size_t j = 0; j < 2; ++j) {
+ EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr)
+ << ToString(static_cast<BlockSize>(i)) << " index [" << width_index
+ << "]"
+ << "[" << height_index << "][" << j << "]";
+ }
+ }
+
+ EXPECT_NE(dsp->average_blend, nullptr);
+ EXPECT_NE(dsp->distance_weighted_blend, nullptr);
+ for (int i = 0; i < kNumObmcDirections; ++i) {
+ EXPECT_NE(dsp->obmc_blend[i], nullptr)
+ << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]";
+ }
+ EXPECT_NE(dsp->warp, nullptr);
+ EXPECT_NE(dsp->warp_compound, nullptr);
+
+ for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) {
+ EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr)
+ << "index [" << i << "]";
+ }
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumAutoRegressionLags; ++j) {
+ if (i == 0 && j == 0) {
+ EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+ << " index [" << i << "]"
+ << "[" << j << "]";
+ } else {
+ EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+ << " index [" << i << "]"
+ << "[" << j << "]";
+ }
+ }
+ EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr)
+ << "index [" << i << "]";
+ EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr)
+ << "index [" << i << "]";
+ }
+ EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr);
+ EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr);
+ EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr);
+
+ EXPECT_NE(dsp->motion_field_projection_kernel, nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[0], nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[1], nullptr);
+ EXPECT_NE(dsp->mv_projection_compound[2], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[0], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[1], nullptr);
+ EXPECT_NE(dsp->mv_projection_single[2], nullptr);
+ }
+}
+
+TEST(Dsp, TablesArePopulated) {
+ DspInit();
+ CheckTables(/*c_only=*/false);
+}
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+TEST(Dsp, TablesArePopulatedCOnly) {
+ test_utils::ResetDspTable(kBitdepth8);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ test_utils::ResetDspTable(kBitdepth10);
+#endif
+ dsp_internal::DspInit_C();
+ CheckTables(/*c_only=*/true);
+}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+TEST(Dsp, GetDspTable) {
+ EXPECT_EQ(GetDspTable(1), nullptr);
+ EXPECT_NE(GetDspTable(8), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ EXPECT_NE(GetDspTable(10), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr);
+#else
+ EXPECT_EQ(GetDspTable(10), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr);
+#endif
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
index fe93270..f75a354 100644
--- a/src/dsp/film_grain.h
+++ b/src/dsp/film_grain.h
@@ -25,6 +25,14 @@
// ARM:
#include "src/dsp/arm/film_grain_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
// IWYU pragma: end_exports
namespace libgav1 {
diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc
new file mode 100644
index 0000000..90960c6
--- /dev/null
+++ b/src/dsp/intra_edge_test.cc
@@ -0,0 +1,504 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+const char kIntraEdge[] = "IntraEdge";
+const char kIntraEdgeFilterName[] = "Intra Edge Filter";
+const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler";
+
+constexpr int kIntraEdgeFilterTestMaxSize = 129;
+constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = {
+ 159, 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253,
+ 233, 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103,
+ 239, 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165,
+ 164, 63, 171, 143, 210, 236, 253, 233, 139, 113, 66, 211, 133, 61, 91,
+ 123, 187, 76, 110, 172, 61, 103, 239, 147, 247, 120, 18, 106, 180, 159,
+ 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, 233,
+ 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, 239,
+ 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, 164,
+ 63, 171, 143, 210, 236, 253, 233, 139, 113,
+};
+constexpr int kIntraEdgeUpsamplerTestFixedInput[] = {
+ 208, 54, 136, 205, 124, 125, 165, 164, 63,
+ 171, 143, 210, 236, 208, 54, 136, 205};
+
+struct EdgeFilterParams {
+ int size;
+ int strength;
+};
+
+std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) {
+ return os << "size: " << param.size << ", strength: " << param.strength;
+}
+
+// Each size is paired with strength 1, 2, and 3.
+// In general, the size is expressible as 2^n+1, but all sizes up to 129 are
+// permissible.
+constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = {
+ {1, 1}, {1, 2}, {1, 3}, {2, 1}, {2, 2}, {2, 3}, {5, 1}, {5, 2},
+ {5, 3}, {9, 1}, {9, 2}, {9, 3}, {17, 1}, {17, 2}, {17, 3}, {33, 1},
+ {33, 2}, {33, 3}, {50, 1}, {50, 2}, {50, 3}, {55, 1}, {55, 2}, {55, 3},
+ {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}};
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
+ public:
+ IntraEdgeFilterTest() = default;
+ IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete;
+ IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete;
+ ~IntraEdgeFilterTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ IntraEdgeInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_intra_edge_filter_ = dsp->intra_edge_filter;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_intra_edge_filter_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraEdgeInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraEdgeInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ cur_intra_edge_filter_ = dsp->intra_edge_filter;
+ }
+
+ void TestFixedValues(const char* digest);
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[kIntraEdgeFilterTestMaxSize];
+ Pixel base_buffer_[kIntraEdgeFilterTestMaxSize];
+ int strength_ = GetParam().strength;
+ int size_ = GetParam().size;
+
+ IntraEdgeFilterFunc base_intra_edge_filter_;
+ IntraEdgeFilterFunc cur_intra_edge_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues(
+ const char* const digest) {
+ if (cur_intra_edge_filter_ == nullptr) return;
+ for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+ buffer_[i] = kIntraEdgeFilterTestFixedInput[i];
+ }
+ const absl::Time start = absl::Now();
+ cur_intra_edge_filter_(buffer_, size_, strength_);
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_,
+ kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (base_intra_edge_filter_ == nullptr) return;
+ if (cur_intra_edge_filter_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration elapsed_time;
+ absl::Duration base_elapsed_time;
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+ const Pixel val = rnd(bitdepth);
+ buffer_[i] = val;
+ base_buffer_[i] = val;
+ }
+ const absl::Time base_start = absl::Now();
+ base_intra_edge_filter_(base_buffer_, size_, strength_);
+ base_elapsed_time += absl::Now() - base_start;
+ const absl::Time start = absl::Now();
+ cur_intra_edge_filter_(buffer_, size_, strength_);
+ elapsed_time += absl::Now() - start;
+ }
+ if (num_runs > 1) {
+ printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n",
+ kIntraEdge, kIntraEdgeFilterName, size_, strength_,
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ absl::ToDoubleMicroseconds(base_elapsed_time) /
+ absl::ToDoubleMicroseconds(elapsed_time));
+ } else {
+ printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge,
+ kIntraEdgeFilterName, size_, strength_);
+ }
+ for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+ EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+ }
+}
+
+using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>;
+
+const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) {
+ static const char* const kDigestsSize1[3] = {
+ "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1",
+ "f7f681cf7047602fafc7fb416ecf46e1"};
+ static const char* const kDigestsSize2[3] = {
+ "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7",
+ "a7eb3dba95ff35c2df45a274afbc9772"};
+ static const char* const kDigestsSize5[3] = {
+ "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22",
+ "d313523d3b7646fdbb873c61ffe7a51a"};
+ static const char* const kDigestsSize9[3] = {
+ "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272",
+ "bdf4f16734c86338716fb436c196ecc6"};
+ static const char* const kDigestsSize17[3] = {
+ "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd",
+ "8f68603598638fa33203fe1233d273b1"};
+ static const char* const kDigestsSize33[3] = {
+ "51156da8f4d527e0c011040769987dbd", "eff17eaf73a7bb7fd4c921510ade9f67",
+ "aca87680e0649d0728091c92c6de8871"};
+ static const char* const kDigestsSize50[3] = {
+ "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2",
+ "16a9148daf0e5f69808b9f0caa1ef110"};
+ static const char* const kDigestsSize55[3] = {
+ "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a",
+ "0318b2fde088c472215fe155f3b48d36"};
+ static const char* const kDigestsSize65[3] = {
+ "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3",
+ "d7c71db339c28d33119974987b2f9d85"};
+ static const char* const kDigestsSize129[3] = {
+ "bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b",
+ "7306715602b0f5536771724a2f0a39bc"};
+
+ switch (size) {
+ case 1:
+ return kDigestsSize1[strength - 1];
+ case 2:
+ return kDigestsSize2[strength - 1];
+ case 5:
+ return kDigestsSize5[strength - 1];
+ case 9:
+ return kDigestsSize9[strength - 1];
+ case 17:
+ return kDigestsSize17[strength - 1];
+ case 33:
+ return kDigestsSize33[strength - 1];
+ case 50:
+ return kDigestsSize50[strength - 1];
+ case 55:
+ return kDigestsSize55[strength - 1];
+ case 65:
+ return kDigestsSize65[strength - 1];
+ case 129:
+ return kDigestsSize129[strength - 1];
+ default:
+ ADD_FAILURE() << "Unknown edge size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, Correctness) {
+ TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) {
+ static const char* const kDigestsSize1[3] = {
+ "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0",
+ "2d2088560e3ccb5b809c97f5299bb1c0"};
+ static const char* const kDigestsSize2[3] = {
+ "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181",
+ "bcdd1b21f3baf5f6f29caea9ef93fb0c"};
+ static const char* const kDigestsSize5[3] = {
+ "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2",
+ "48d516b06313683aca30e975ce6a3cad"};
+ static const char* const kDigestsSize9[3] = {
+ "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2",
+ "bb61aa9c5fa720c667a053769e7b7d08"};
+ static const char* const kDigestsSize17[3] = {
+ "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d",
+ "37d44d10a2545385af1da55f8c08564f"};
+ static const char* const kDigestsSize33[3] = {
+ "c95108e06eb2aef61ecb6839af306edd", "832c695460b4dd2b85c5f8726e4470d1",
+ "994902f549eefd83fbcbf7ecb7dc5cca"};
+ static const char* const kDigestsSize50[3] = {
+ "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89",
+ "84e40aadcc416ef3f51cea3cc23b30c7"};
+ static const char* const kDigestsSize55[3] = {
+ "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d",
+ "7c320d55b11f93185b811bdaa379f2db"};
+ static const char* const kDigestsSize65[3] = {
+ "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f",
+ "d06e6b88585f7f1a1f6af5bb59ee2180"};
+ static const char* const kDigestsSize129[3] = {
+ "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738",
+ "500786d8561effec283d4f3d13886f8c"};
+
+ switch (size) {
+ case 1:
+ return kDigestsSize1[strength - 1];
+ case 2:
+ return kDigestsSize2[strength - 1];
+ case 5:
+ return kDigestsSize5[strength - 1];
+ case 9:
+ return kDigestsSize9[strength - 1];
+ case 17:
+ return kDigestsSize17[strength - 1];
+ case 33:
+ return kDigestsSize33[strength - 1];
+ case 50:
+ return kDigestsSize50[strength - 1];
+ case 55:
+ return kDigestsSize55[strength - 1];
+ case 65:
+ return kDigestsSize65[strength - 1];
+ case 129:
+ return kDigestsSize129[strength - 1];
+ default:
+ ADD_FAILURE() << "Unknown edge size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
+ public:
+ IntraEdgeUpsamplerTest() = default;
+ IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete;
+ IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete;
+ ~IntraEdgeUpsamplerTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ IntraEdgeInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_intra_edge_upsampler_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraEdgeInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraEdgeInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+ }
+
+ void TestFixedValues(const char* digest);
+ void TestRandomValues(int num_runs);
+
+ Pixel buffer_[128];
+ Pixel base_buffer_[128];
+ int size_ = GetParam();
+
+ IntraEdgeUpsamplerFunc base_intra_edge_upsampler_;
+ IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues(
+ const char* const digest) {
+ if (cur_intra_edge_upsampler_ == nullptr) return;
+ buffer_[0] = 0;
+ for (int i = 0; i < size_ + 1; ++i) {
+ buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i];
+ }
+ const absl::Time start = absl::Now();
+ cur_intra_edge_upsampler_(buffer_ + 2, size_);
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest,
+ buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+ if (base_intra_edge_upsampler_ == nullptr) return;
+ if (cur_intra_edge_upsampler_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ absl::Duration base_elapsed_time;
+ absl::Duration elapsed_time;
+ for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+ // Populate what will be buffer[-2..size] when passed to the upsample
+ // function.
+ buffer_[0] = 0;
+ base_buffer_[0] = 0;
+ for (int i = 1; i < size_ + 2; ++i) {
+ const Pixel val = rnd(bitdepth);
+ buffer_[i] = val;
+ base_buffer_[i] = val;
+ }
+ const absl::Time base_start = absl::Now();
+ base_intra_edge_upsampler_(base_buffer_ + 2, size_);
+ base_elapsed_time += absl::Now() - base_start;
+ const absl::Time start = absl::Now();
+ cur_intra_edge_upsampler_(buffer_ + 2, size_);
+ elapsed_time += absl::Now() - start;
+ }
+ if (num_runs > 1) {
+ printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge,
+ kIntraEdgeUpsamplerName, size_,
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ absl::ToDoubleMicroseconds(base_elapsed_time) /
+ absl::ToDoubleMicroseconds(elapsed_time));
+ } else {
+ printf("Mode %s[%31s]: size %d \n", kIntraEdge, kIntraEdgeUpsamplerName,
+ size_);
+ }
+
+ for (int i = 0; i < size_ * 2 + 1; ++i) {
+ EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+ }
+}
+
+using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>;
+
+constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16};
+
+const char* GetIntraEdgeUpsampleDigest8bpp(int size) {
+ switch (size) {
+ case 4:
+ return "aa9002e03f8d15eb26bbee76f40bb923";
+ case 8:
+ return "cacfca86d65eff0d951eb21fc15f242a";
+ case 12:
+ return "0529e00a1fa80bc866fa7662ad2d7b9f";
+ case 16:
+ return "03e3b3e0ea438ea48ef05651c0a54986";
+ default:
+ ADD_FAILURE() << "Unknown upsample size: " << size;
+ return "";
+ }
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeUpsamplerTest10bpp = IntraEdgeUpsamplerTest<10, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest10bpp(int size) {
+ switch (size) {
+ case 4:
+ return "341c6bb705a02bba65b34f92d8ca83cf";
+ case 8:
+ return "fdbe4b3b341921dcb0edf00dfc4d7667";
+ case 12:
+ return "ad69a491287495ec9973d4006d5ac461";
+ case 16:
+ return "04acf32e517d80ce4c4958e711b9b890";
+ default:
+ ADD_FAILURE() << "Unknown upsample size: " << size;
+ return "";
+ }
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#endif
+} // namespace
+} // namespace dsp
+} // namespace libgav1
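For reference, the 8bpp edge filter these digests cover applies one of three 5-tap kernels, selected by strength, along the edge buffer. The standalone sketch below follows the AV1 reference behavior (kernels summing to 16, rounded shift by 4); it is illustrative only and is not the libgav1 implementation.

// Sketch of an 8bpp intra edge filter, assuming the AV1 reference kernels.
// buffer[0] is left unfiltered and out-of-range taps clamp to the edge ends.
#include <algorithm>
#include <cstdint>
#include <cstring>

void FilterIntraEdgeSketch(uint8_t* buffer, int size, int strength) {
  if (strength == 0 || size <= 1) return;
  static constexpr int kKernels[3][5] = {
      {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
  const int* const kernel = kKernels[strength - 1];
  uint8_t edge[129];
  memcpy(edge, buffer, size);  // Taps read the unfiltered copy.
  for (int i = 1; i < size; ++i) {
    int sum = 0;
    for (int j = 0; j < 5; ++j) {
      const int k = std::min(std::max(i - 2 + j, 0), size - 1);
      sum += kernel[j] * edge[k];
    }
    buffer[i] = static_cast<uint8_t>((sum + 8) >> 4);
  }
}

Under this behavior a size-1 edge is returned unchanged for every strength, which is consistent with the size-1 digests above being identical across strengths.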
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
index 4bcb580..4520c2c 100644
--- a/src/dsp/intrapred.cc
+++ b/src/dsp/intrapred.cc
@@ -19,21 +19,18 @@
#include <cstddef>
#include <cstdint>
#include <cstdlib>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#include "src/utils/memory.h"
namespace libgav1 {
namespace dsp {
namespace {
-constexpr TransformSize kTransformSizesLargerThan32x32[] = {
- kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
- kTransformSize64x32, kTransformSize64x64};
-
template <int block_width, int block_height, typename Pixel>
struct IntraPredFuncs_C {
IntraPredFuncs_C() = delete;
@@ -50,12 +47,6 @@ struct IntraPredFuncs_C {
const void* left_column);
static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
const void* left_column);
- static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
- const void* left_column);
- static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
- const void* left_column);
- static void SmoothHorizontal(void* dest, ptrdiff_t stride,
- const void* top_row, const void* left_column);
};
// Intra-predictors that require bitdepth.
@@ -190,16 +181,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
}
}
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b) {
- return static_cast<Pixel>((a + b + 1) >> 1);
-}
-
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b, Pixel c) {
- return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
-}
-
// IntraPredFuncs_C::Paeth
template <int block_width, int block_height, typename Pixel>
void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
@@ -238,110 +219,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
}
}
-constexpr uint8_t kSmoothWeights[] = {
- // block dimension = 4
- 255, 149, 85, 64,
- // block dimension = 8
- 255, 197, 146, 105, 73, 50, 37, 32,
- // block dimension = 16
- 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
- // block dimension = 32
- 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
- 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
- // block dimension = 64
- 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
- 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
- 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
- 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
-
-// IntraPredFuncs_C::Smooth
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel top_right = top[block_width - 1];
- const Pixel bottom_left = left[block_height - 1];
- static_assert(
- block_width >= 4 && block_height >= 4,
- "Weights for smooth predictor undefined for block width/height < 4");
- const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
- const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
- uint32_t pred = weights_y[y] * top[x];
- pred += weights_x[x] * left[y];
- pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
- pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
- // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
- // + 256. With the descale there's no need for saturation.
- dst[x] = static_cast<Pixel>(
- RightShiftWithRounding(pred, kSmoothWeightScale + 1));
- }
- dst += stride;
- }
-}
-
-// IntraPredFuncs_C::SmoothVertical
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel bottom_left = left[block_height - 1];
- static_assert(block_height >= 4,
- "Weights for smooth predictor undefined for block height < 4");
- const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_y[y]);
- uint32_t pred = weights_y[y] * top[x];
- pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
- dst[x] =
- static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
- }
- dst += stride;
- }
-}
-
-// IntraPredFuncs_C::SmoothHorizontal
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel top_right = top[block_width - 1];
- static_assert(block_width >= 4,
- "Weights for smooth predictor undefined for block width < 4");
- const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_x[x]);
- uint32_t pred = weights_x[x] * left[y];
- pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
- dst[x] =
- static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
- }
- dst += stride;
- }
-}
-
//------------------------------------------------------------------------------
// IntraPredBppFuncs_C
template <int fill, typename Pixel>
@@ -366,288 +243,7 @@ void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
block_height);
}
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_C
-
-template <int bitdepth, typename Pixel>
-void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const FilterIntraPredictor pred, const int width,
- const int height) {
- const int kMaxPixel = (1 << bitdepth) - 1;
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
-
- assert(width <= 32 && height <= 32);
-
- Pixel buffer[3][33]; // cache 2 rows + top & left boundaries
- memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
-
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
- int row0 = 0, row2 = 2;
- int ystep = 1;
- int y = 0;
- do {
- buffer[1][0] = left[y];
- buffer[row2][0] = left[y + 1];
- int x = 1;
- do {
- const Pixel p0 = buffer[row0][x - 1]; // top-left
- const Pixel p1 = buffer[row0][x + 0]; // top 0
- const Pixel p2 = buffer[row0][x + 1]; // top 1
- const Pixel p3 = buffer[row0][x + 2]; // top 2
- const Pixel p4 = buffer[row0][x + 3]; // top 3
- const Pixel p5 = buffer[1][x - 1]; // left 0
- const Pixel p6 = buffer[row2][x - 1]; // left 1
- for (int i = 0; i < 8; ++i) {
- const int xoffset = i & 0x03;
- const int yoffset = (i >> 2) * ystep;
- const int value = kFilterIntraTaps[pred][i][0] * p0 +
- kFilterIntraTaps[pred][i][1] * p1 +
- kFilterIntraTaps[pred][i][2] * p2 +
- kFilterIntraTaps[pred][i][3] * p3 +
- kFilterIntraTaps[pred][i][4] * p4 +
- kFilterIntraTaps[pred][i][5] * p5 +
- kFilterIntraTaps[pred][i][6] * p6;
- buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
- Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
- }
- x += 4;
- } while (x < width);
- memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
- dst += stride;
- memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
- dst += stride;
-
- // The final row becomes the top for the next pass.
- row0 ^= 2;
- row2 ^= 2;
- ystep = -ystep;
- y += 2;
- } while (y < height);
-}
-
-//------------------------------------------------------------------------------
-// CflIntraPredictor_C
-
-// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
-// |alpha| can be -16 to 16 (inclusive).
-template <int block_width, int block_height, int bitdepth, typename Pixel>
-void CflIntraPredictor_C(
- void* const dest, ptrdiff_t stride,
- const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int alpha) {
- auto* dst = static_cast<Pixel*>(dest);
- const int dc = dst[0];
- stride /= sizeof(Pixel);
- const int max_value = (1 << bitdepth) - 1;
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
- assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
- dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
- 0, max_value);
- }
- dst += stride;
- }
-}
-
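The CfL reconstruction removed above is
dst = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma, 6), 0, max_value),
where dc is the DC prediction already written at dst[0] and luma holds the
mean-removed, subsampled luma. A small single-sample check for 8-bit content;
the helper name and the numbers are illustrative only:

#include <algorithm>
#include <cstdint>

// One 8-bit CfL sample: luma is the mean-removed subsampled luma (scaled by
// 8), alpha is the signaled scaling factor in [-16, 16].
uint8_t CflSample(int dc, int16_t luma, int alpha) {
  const int scaled = alpha * luma;
  // Round half away from zero, as RightShiftWithRoundingSigned(scaled, 6).
  const int delta =
      (scaled >= 0) ? ((scaled + 32) >> 6) : -((-scaled + 32) >> 6);
  return static_cast<uint8_t>(std::min(std::max(dc + delta, 0), 255));
}
// Example: dc = 128, luma = 40, alpha = 4 -> 4 * 40 = 160,
// (160 + 32) >> 6 = 3, so the reconstructed sample is 128 + 3 = 131.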
-//------------------------------------------------------------------------------
-// CflSubsampler_C
-
-template <int block_width, int block_height, int bitdepth, typename Pixel,
- int subsampling_x, int subsampling_y>
-void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
- assert(max_luma_width >= 4);
- assert(max_luma_height >= 4);
- const auto* src = static_cast<const Pixel*>(source);
- stride /= sizeof(Pixel);
- int sum = 0;
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- const ptrdiff_t luma_x =
- std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
- const ptrdiff_t luma_x_next = luma_x + stride;
- luma[y][x] =
- (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
- ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
- : 0))
- << (3 - subsampling_x - subsampling_y);
- sum += luma[y][x];
- }
- if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
- src += stride << subsampling_y;
- }
- }
- const int average = RightShiftWithRounding(
- sum, FloorLog2(block_width) + FloorLog2(block_height));
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- luma[y][x] -= average;
- }
- }
-}
-
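The subsampler above sums the co-located luma pixels (1, 2, or 4 of them,
depending on the subsampling mode), scales the sum so every mode stores 8x the
luma average (the << (3 - subsampling_x - subsampling_y) term), and finally
subtracts the block mean so the retained AC values are zero-mean. For 4:2:0
each stored value is therefore twice the 2x2 sum. A tiny sketch with made-up
pixel values:

#include <cstdint>

// 4:2:0 case of the subsampler above, before mean removal: each stored value
// is (a + b + c + d) << 1, i.e. 8x the average of the 2x2 luma patch.
int16_t CflLuma420(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return static_cast<int16_t>((a + b + c + d) << 1);
}
// Example: a patch of {100, 102, 98, 100} stores 400 << 1 = 800; if the whole
// block averages to 760 on the same scale, the final AC value kept for CfL is
// 800 - 760 = 40.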
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- assert(xstep > 0);
-
- // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
- // |top[top_base_x]|. This corresponds to a 45 degree prediction.
- if (xstep == 64) {
- // 7.11.2.10. Intra edge upsample selection process
- // if ( d <= 0 || d >= 40 ) useUpsample = 0
- // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
- // |predictor_angle| is 45 the delta is also 45.
- assert(!upsampled_top);
- const Pixel* top_ptr = top + 1;
- for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
- memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
- }
- return;
- }
-
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int max_base_x = ((width + height) - 1) << upsample_shift;
- const int scale_bits = 6 - upsample_shift;
- const int base_step = 1 << upsample_shift;
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
-
- if (top_base_x >= max_base_x) {
- for (int i = y; i < height; ++i) {
- Memset(dst, top[max_base_x], width);
- dst += stride;
- }
- return;
- }
-
- const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
- int x = 0;
- do {
- if (top_base_x >= max_base_x) {
- Memset(dst + x, top[max_base_x], width - x);
- break;
- }
-
- const int val =
- top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
- dst[x] = RightShiftWithRounding(val, 5);
- top_base_x += base_step;
- } while (++x < width);
-
- dst += stride;
- top_x += xstep;
- } while (++y < height);
-}
-
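Away from the exact 45 degree case, Zone1 above is a two-tap interpolation
along the top row: the 6-bit fractional position is halved to a 5-bit weight
and val = top[base] * (32 - shift) + top[base + 1] * shift is rounded down by
5 bits. A single-sample sketch; the helper name and pixel values are
illustrative only:

#include <cstdint>

// One Zone1 output sample. |pos| is the 6-bit fixed-point position along the
// top row (top_x << upsample_shift in the code above).
uint8_t Zone1Sample(const uint8_t* top, int pos) {
  const int base = pos >> 6;
  const int shift = (pos & 0x3F) >> 1;  // 5-bit interpolation weight
  const int val = top[base] * (32 - shift) + top[base + 1] * shift;
  return static_cast<uint8_t>((val + 16) >> 5);  // RightShiftWithRounding
}
// Example: pos = 96 (one and a half pixels in), top[1] = 100, top[2] = 120
// gives shift = 16, val = 100 * 16 + 120 * 16 = 3520, (3520 + 16) >> 5 = 110,
// the midpoint of the two neighbours.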
-template <typename Pixel>
-void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- assert(xstep > 0);
- assert(ystep > 0);
-
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int scale_bits_x = 6 - upsample_top_shift;
- const int scale_bits_y = 6 - upsample_left_shift;
- const int min_base_x = -(1 << upsample_top_shift);
- const int base_step_x = 1 << upsample_top_shift;
- int y = 0;
- int top_x = -xstep;
- do {
- int top_base_x = top_x >> scale_bits_x;
- int left_y = (y << 6) - ystep;
- int x = 0;
- do {
- int val;
- if (top_base_x >= min_base_x) {
- const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
- val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
- } else {
- // Note this assumes an arithmetic shift to handle negative values.
- const int left_base_y = left_y >> scale_bits_y;
- const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
- assert(left_base_y >= -(1 << upsample_left_shift));
- val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
- }
- dst[x] = RightShiftWithRounding(val, 5);
- top_base_x += base_step_x;
- left_y -= ystep;
- } while (++x < width);
-
- top_x -= xstep;
- dst += stride;
- } while (++y < height);
-}
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled_left) {
- const auto* const left = static_cast<const Pixel*>(left_column);
- stride /= sizeof(Pixel);
-
- assert(ystep > 0);
-
- const int upsample_shift = static_cast<int>(upsampled_left);
- const int scale_bits = 6 - upsample_shift;
- const int base_step = 1 << upsample_shift;
- // Zone3 never runs out of left_column values.
- assert((width + height - 1) << upsample_shift > // max_base_y
- ((ystep * width) >> scale_bits) +
- base_step * (height - 1)); // left_base_y
-
- int left_y = ystep;
- int x = 0;
- do {
- auto* dst = static_cast<Pixel*>(dest);
-
- int left_base_y = left_y >> scale_bits;
- int y = 0;
- do {
- const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
- const int val =
- left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
- dst[x] = RightShiftWithRounding(val, 5);
- dst += stride;
- left_base_y += base_step;
- } while (++y < height);
-
- left_y += ystep;
- } while (++x < width);
-}
-
-//------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------
template <typename Pixel>
struct IntraPredDefs {
@@ -718,15 +314,7 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
DEFS::_##W##x##H::Horizontal; \
dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \
- DEFS::_##W##x##H::Paeth; \
- dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
- DEFS::_##W##x##H::Smooth; \
- dsp->intra_predictors[kTransformSize##W##x##H] \
- [kIntraPredictorSmoothVertical] = \
- DEFS::_##W##x##H::SmoothVertical; \
- dsp->intra_predictors[kTransformSize##W##x##H] \
- [kIntraPredictorSmoothHorizontal] = \
- DEFS::_##W##x##H::SmoothHorizontal
+ DEFS::_##W##x##H::Paeth
#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \
@@ -749,45 +337,11 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
-#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
- dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
- CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
-
-#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
INIT_INTRAPREDICTORS(Defs, Defs8bpp);
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint8_t>;
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint8_t>;
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint8_t>;
- dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
- INIT_CFL_INTRAPREDICTORS(8, uint8_t);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -816,19 +370,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
Defs::_4x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
- Defs::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
- Defs::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
- Defs::_4x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
Defs8bpp::_4x8::DcFill;
@@ -856,19 +397,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
Defs::_4x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
- Defs::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
- Defs::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
- Defs::_4x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
Defs8bpp::_4x16::DcFill;
@@ -897,19 +425,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
Defs::_4x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
- Defs::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
- Defs::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
- Defs::_4x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
Defs8bpp::_8x4::DcFill;
@@ -937,19 +452,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
Defs::_8x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
- Defs::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
- Defs::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
- Defs::_8x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
Defs8bpp::_8x8::DcFill;
@@ -977,19 +479,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
Defs::_8x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
- Defs::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
- Defs::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
- Defs::_8x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
Defs8bpp::_8x16::DcFill;
@@ -1018,19 +507,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
Defs::_8x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
- Defs::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
- Defs::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
- Defs::_8x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
Defs8bpp::_8x32::DcFill;
@@ -1059,19 +535,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
Defs::_8x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
- Defs::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
- Defs::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
- Defs::_8x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
Defs8bpp::_16x4::DcFill;
@@ -1100,19 +563,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
Defs::_16x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
- Defs::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
- Defs::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
- Defs::_16x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
Defs8bpp::_16x8::DcFill;
@@ -1141,19 +591,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
Defs::_16x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
- Defs::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
- Defs::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
- Defs::_16x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
Defs8bpp::_16x16::DcFill;
@@ -1182,19 +619,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
Defs::_16x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
- Defs::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
- Defs::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
- Defs::_16x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
Defs8bpp::_16x32::DcFill;
@@ -1223,19 +647,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
Defs::_16x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
- Defs::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
- Defs::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
- Defs::_16x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
Defs8bpp::_16x64::DcFill;
@@ -1264,19 +675,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
Defs::_16x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
- Defs::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
- Defs::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
- Defs::_16x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
Defs8bpp::_32x8::DcFill;
@@ -1305,19 +703,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
Defs::_32x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
- Defs::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
- Defs::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
- Defs::_32x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
Defs8bpp::_32x16::DcFill;
@@ -1346,19 +731,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
Defs::_32x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
- Defs::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
- Defs::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
- Defs::_32x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
Defs8bpp::_32x32::DcFill;
@@ -1387,19 +759,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
Defs::_32x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
- Defs::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
- Defs::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
- Defs::_32x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
Defs8bpp::_32x64::DcFill;
@@ -1428,19 +787,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
Defs::_32x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
- Defs::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
- Defs::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
- Defs::_32x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
Defs8bpp::_64x16::DcFill;
@@ -1469,19 +815,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
Defs::_64x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
- Defs::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
- Defs::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
- Defs::_64x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
Defs8bpp::_64x32::DcFill;
@@ -1510,19 +843,6 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
Defs::_64x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
- Defs::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
- Defs::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
- Defs::_64x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
Defs8bpp::_64x64::DcFill;
@@ -1551,282 +871,7 @@ void Init8bpp() {
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
Defs::_64x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
- Defs::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
- Defs::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
- Defs::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
- dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x4] =
- CflIntraPredictor_C<4, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
- CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
- CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
- CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x8] =
- CflIntraPredictor_C<4, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
- CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
- CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
- CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x16] =
- CflIntraPredictor_C<4, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
- CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
- CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
- CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x4] =
- CflIntraPredictor_C<8, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
- CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
- CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
- CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x8] =
- CflIntraPredictor_C<8, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
- CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
- CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
- CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x16] =
- CflIntraPredictor_C<8, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
- CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
- CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
- CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x32] =
- CflIntraPredictor_C<8, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
- CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
- CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
- CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x4] =
- CflIntraPredictor_C<16, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
- CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
- CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
- CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x8] =
- CflIntraPredictor_C<16, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
- CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
- CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
- CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x16] =
- CflIntraPredictor_C<16, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
- CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
- CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
- CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x32] =
- CflIntraPredictor_C<16, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
- CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
- CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
- CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x8] =
- CflIntraPredictor_C<32, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
- CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
- CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
- CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x16] =
- CflIntraPredictor_C<32, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
- CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
- CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
- CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x32] =
- CflIntraPredictor_C<32, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
- CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
- CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
- CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
-#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- // Cfl predictors are available only for transform sizes with max(width,
- // height) <= 32. Set all others to nullptr.
- for (const auto i : kTransformSizesLargerThan32x32) {
- dsp->cfl_intra_predictors[i] = nullptr;
- for (int j = 0; j < kNumSubsamplingTypes; ++j) {
- dsp->cfl_subsamplers[i][j] = nullptr;
- }
- }
} // NOLINT(readability/fn_size)
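Both Init8bpp() above and Init10bpp() below follow the same registration
pattern: when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is set every C implementation
is installed unconditionally; otherwise a C entry is installed only when no
LIBGAV1_Dsp8bpp_* / LIBGAV1_Dsp10bpp_* guard macro has been defined for that
slot (presumably by a header that supplies an optimized version). One entry
from above, repeated for clarity:

#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
  // No optimized 4x4 DcFill was registered, so keep the C fallback.
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
      Defs8bpp::_4x4::DcFill;
#endif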
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -1838,14 +883,6 @@ void Init10bpp() {
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint16_t>;
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint16_t>;
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint16_t>;
- dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
- INIT_CFL_INTRAPREDICTORS(10, uint16_t);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -1875,19 +912,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
DefsHbd::_4x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
- DefsHbd::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
Defs10bpp::_4x8::DcFill;
@@ -1916,19 +940,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
DefsHbd::_4x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
- DefsHbd::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
Defs10bpp::_4x16::DcFill;
@@ -1957,19 +968,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
DefsHbd::_4x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
- DefsHbd::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
Defs10bpp::_8x4::DcFill;
@@ -1998,19 +996,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
DefsHbd::_8x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
- DefsHbd::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
Defs10bpp::_8x8::DcFill;
@@ -2039,19 +1024,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
DefsHbd::_8x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
- DefsHbd::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
Defs10bpp::_8x16::DcFill;
@@ -2080,19 +1052,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
DefsHbd::_8x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
- DefsHbd::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
Defs10bpp::_8x32::DcFill;
@@ -2121,19 +1080,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
DefsHbd::_8x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
- DefsHbd::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
Defs10bpp::_16x4::DcFill;
@@ -2162,19 +1108,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
DefsHbd::_16x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
- DefsHbd::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
Defs10bpp::_16x8::DcFill;
@@ -2203,19 +1136,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
DefsHbd::_16x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
- DefsHbd::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
Defs10bpp::_16x16::DcFill;
@@ -2244,19 +1164,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
DefsHbd::_16x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
- DefsHbd::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
Defs10bpp::_16x32::DcFill;
@@ -2285,19 +1192,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
DefsHbd::_16x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
- DefsHbd::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
Defs10bpp::_16x64::DcFill;
@@ -2326,19 +1220,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
DefsHbd::_16x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
- DefsHbd::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
Defs10bpp::_32x8::DcFill;
@@ -2367,19 +1248,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
DefsHbd::_32x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
- DefsHbd::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
Defs10bpp::_32x16::DcFill;
@@ -2408,19 +1276,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
DefsHbd::_32x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
- DefsHbd::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
Defs10bpp::_32x32::DcFill;
@@ -2449,19 +1304,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
DefsHbd::_32x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
- DefsHbd::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
Defs10bpp::_32x64::DcFill;
@@ -2490,19 +1332,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
DefsHbd::_32x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
- DefsHbd::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
Defs10bpp::_64x16::DcFill;
@@ -2531,19 +1360,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
DefsHbd::_64x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
- DefsHbd::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
Defs10bpp::_64x32::DcFill;
@@ -2572,19 +1388,6 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
DefsHbd::_64x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
- DefsHbd::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
Defs10bpp::_64x64::DcFill;
@@ -2613,291 +1416,12 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
DefsHbd::_64x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
- DefsHbd::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
- dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x4] =
- CflIntraPredictor_C<4, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
- CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
- CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
- CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x8] =
- CflIntraPredictor_C<4, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
- CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
- CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
- CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x16] =
- CflIntraPredictor_C<4, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
- CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
- CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
- CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x4] =
- CflIntraPredictor_C<8, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
- CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
- CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
- CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x8] =
- CflIntraPredictor_C<8, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
- CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
- CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
- CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x16] =
- CflIntraPredictor_C<8, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
- CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
- CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
- CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x32] =
- CflIntraPredictor_C<8, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
- CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
- CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
- CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x4] =
- CflIntraPredictor_C<16, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
- CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
- CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
- CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x8] =
- CflIntraPredictor_C<16, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
- CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
- CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
- CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x16] =
- CflIntraPredictor_C<16, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
- CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
- CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
- CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x32] =
- CflIntraPredictor_C<16, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
- CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
- CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
- CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x8] =
- CflIntraPredictor_C<32, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
- CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
- CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
- CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x16] =
- CflIntraPredictor_C<32, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
- CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
- CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
- CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x32] =
- CflIntraPredictor_C<32, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
- CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
- CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
- CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
-#endif
-
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- // Cfl predictors are available only for transform sizes with max(width,
- // height) <= 32. Set all others to nullptr.
- for (const auto i : kTransformSizesLargerThan32x32) {
- dsp->cfl_intra_predictors[i] = nullptr;
- for (int j = 0; j < kNumSubsamplingTypes; ++j) {
- dsp->cfl_subsamplers[i][j] = nullptr;
- }
- }
} // NOLINT(readability/fn_size)
#endif // LIBGAV1_MAX_BITDEPTH >= 10
-#undef INIT_CFL_INTRAPREDICTOR_WxH
-#undef INIT_CFL_INTRAPREDICTORS
#undef INIT_INTRAPREDICTORS_WxH
#undef INIT_INTRAPREDICTORS
-
} // namespace
void IntraPredInit_C() {
diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h
index c5286ef..2cb625d 100644
--- a/src/dsp/intrapred.h
+++ b/src/dsp/intrapred.h
@@ -38,9 +38,7 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor. This function is not thread-safe.
+// Initializes Dsp::intra_predictors. This function is not thread-safe.
void IntraPredInit_C();
} // namespace dsp
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
new file mode 100644
index 0000000..948c0c0
--- /dev/null
+++ b/src/dsp/intrapred_cfl.cc
@@ -0,0 +1,654 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+ kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<Pixel*>(dest);
+ const int dc = dst[0];
+ stride /= sizeof(Pixel);
+ const int max_value = (1 << bitdepth) - 1;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+ assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+ dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+ 0, max_value);
+ }
+ dst += stride;
+ }
+}
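A worked example of the fixed-point math above (a sketch with made-up values,
not part of the patch): for bitdepth 8 with dst[0] = dc = 128, alpha = 8 and a
subsampled luma value of 96,

    alpha * luma[y][x]                   = 8 * 96 = 768
    RightShiftWithRoundingSigned(768, 6) = (768 + 32) >> 6 = 12
    dst[x] = Clip3(128 + 12, 0, 255)     = 140

Because the subsampler below stores luma scaled by 8 and the predictor shifts
by 6, an alpha of magnitude 16 corresponds to an effective chroma-from-luma
gain of 2.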
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+ int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int sum = 0;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ const ptrdiff_t luma_x =
+ std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+ const ptrdiff_t luma_x_next = luma_x + stride;
+ luma[y][x] =
+ (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+ ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+ : 0))
+ << (3 - subsampling_x - subsampling_y);
+ sum += luma[y][x];
+ }
+ if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+ src += stride << subsampling_y;
+ }
+ }
+ const int average = RightShiftWithRounding(
+ sum, FloorLog2(block_width) + FloorLog2(block_height));
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ luma[y][x] -= average;
+ }
+ }
+}
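Numerically (again a sketch with invented values), every subsampling mode
writes 8 times the mean luma of the co-located footprint into |luma| before
the block average is removed, which is what keeps the values inside the
+/-(((1 << bitdepth) - 1) << 3) bound documented for CflIntraPredictor_C:

    4:4:4, single pixel 101:                  101 << (3 - 0 - 0)           = 808
    4:2:0, 2x2 footprint {100, 102, 98, 104}: (100 + 102 + 98 + 104) << 1  = 808

The second pair of loops then subtracts the rounded block average, so the
table ends up holding zero-mean AC values.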
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
+ dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
+ CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
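Written out, a single INIT_CFL_INTRAPREDICTOR_WxH(4, 4, 8, uint8_t) invocation
expands to the four assignments below (shown here for reference only; the
expansion mirrors the explicit #ifndef-guarded blocks later in this file):

    dsp->cfl_intra_predictors[kTransformSize4x4] =
        CflIntraPredictor_C<4, 4, 8, uint8_t>;
    dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
        CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
    dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
        CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
    dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
        CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;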
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+
+} // namespace
+
+void IntraPredCflInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
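A minimal usage sketch for the new entry point (not part of the patch; written
as if inside namespace libgav1::dsp, like the tests below, so the dsp.h names
resolve unqualified, and the function name UseCflC is made up for
illustration):

    void UseCflC() {
      IntraPredCflInit_C();
      const Dsp* const dsp = GetDspTable(8);  // 8-bit function table.
      // Entries for max(width, height) > 32 were set to nullptr above.
      const CflIntraPredictorFunc predictor =
          dsp->cfl_intra_predictors[kTransformSize8x8];
      const CflSubsamplerFunc subsampler =
          dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420];
      static_cast<void>(predictor);
      static_cast<void>(subsampler);
    }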
diff --git a/src/dsp/intrapred_cfl.h b/src/dsp/intrapred_cfl.h
new file mode 100644
index 0000000..4e8a11f
--- /dev/null
+++ b/src/dsp/intrapred_cfl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_cfl_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_cfl_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers.
+// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
new file mode 100644
index 0000000..e700a5b
--- /dev/null
+++ b/src/dsp/intrapred_cfl_test.cc
@@ -0,0 +1,923 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kCflIntraPredName = "kCflIntraPredictor";
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
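For orientation, an editorial summary of the offsets used by Reset() and Set()
with kMaxBlockSize = 64: left and top point 16 pixels into their backing
arrays, so indices down to -16 are addressable. Reset() zeroes [-16..-3],
leaves [-2..-1] randomized for the upsampling case, and zeroes the
kTopMemPadding (7) pixels past top[2 * kMaxBlockSize] that
DirectionalZone1_Large() may overread:

    left_mem: [0..13]=0 | [14..15]=left[-2..-1] | [16..143]=left[0..127]
    top_mem:  [0..13]=0 | [14..15]=top[-2..-1]  | [16..143]=top[0..127] | [144..150]=0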
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// CflIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ CflIntraPredTest() = default;
+ CflIntraPredTest(const CflIntraPredTest&) = delete;
+ CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
+ ~CflIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredCflInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_cfl_intra_pred_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredCflInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraPredCflInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ cur_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+ if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) {
+ cur_cfl_intra_pred_ = nullptr;
+ }
+ }
+
+ // This test modifies intra_pred_mem_.
+ void TestSpeed(const char* digest, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ CflIntraPredictorFunc base_cfl_intra_pred_;
+ CflIntraPredictorFunc cur_cfl_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+ const int num_runs) {
+ if (cur_cfl_intra_pred_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ const int alpha = rnd(33) - 16;
+ const int dc = rnd(1 << bitdepth);
+ const int max_luma = ((1 << bitdepth) - 1) << 3;
+ for (int i = 0; i < block_height_; ++i) {
+ for (int j = 0; j < block_width_; ++j) {
+ if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) {
+ luma[i][j] = max_luma - rnd(max_luma << 1);
+ }
+ }
+ }
+ for (auto& r : intra_pred_mem_.ref_src) r = dc;
+
+ absl::Duration elapsed_time;
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha);
+ elapsed_time += absl::Now() - start;
+ }
+ test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+ intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst),
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_cfl_intra_pred_ == nullptr) return;
+
+ int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+ for (auto& line : luma_buffer) {
+ for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3;
+ }
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ static constexpr int kSaturatedAlpha[] = {-16, 16};
+ for (const int alpha : kSaturatedAlpha) {
+ for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+ << alpha << " differs from reference.";
+ break;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_cfl_intra_pred_ == nullptr) return;
+ int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+
+ const int max_luma = ((1 << bitdepth) - 1) << 3;
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (auto& line : luma_buffer) {
+ for (auto& luma : line) luma = max_luma - rnd(max_luma << 1);
+ }
+ const int dc = rnd(1 << bitdepth);
+ for (auto& r : intra_pred_mem_.ref_src) r = dc;
+ static constexpr int kSaturatedAlpha[] = {-16, 16};
+ for (const int alpha : kSaturatedAlpha) {
+ intra_pred_mem_.Reset(&rnd);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+ cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+ << alpha << " differs from reference.";
+ break;
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ CflSubsamplerTest() = default;
+ CflSubsamplerTest(const CflSubsamplerTest&) = delete;
+ CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
+ ~CflSubsamplerTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredCflInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_cfl_subsampler_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredCflInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraPredCflInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+ }
+
+ // This test modifies intra_pred_mem_.
+ void TestSpeed(const char* digest, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ enum SubsamplingType SubsamplingType() const { return subsampling_type; }
+
+ CflSubsamplerFunc base_cfl_subsampler_;
+ CflSubsamplerFunc cur_cfl_subsampler_;
+};
+
+// When a dimension is subsampled, the luma source and the chroma output are
+// never both at the minimum size (4) in that dimension, so the helpers below
+// enlarge a minimum-sized dimension accordingly.
+int GetLumaWidth(int block_width, SubsamplingType subsampling_type) {
+ if (block_width == 4) {
+ const int width_shift =
+ static_cast<int>(subsampling_type != kSubsamplingType444);
+ return block_width << width_shift;
+ }
+ return block_width;
+}
+
+int GetLumaHeight(int block_height, SubsamplingType subsampling_type) {
+ if (block_height == 4) {
+ const int height_shift =
+ static_cast<int>(subsampling_type == kSubsamplingType420);
+ return block_height << height_shift;
+ }
+ return block_height;
+}
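Concretely (editorial examples, not part of the patch), the two helpers only
enlarge the minimum 4-pixel dimension, and only when it is subsampled:

    GetLumaWidth(4, kSubsamplingType444)  == 4
    GetLumaWidth(4, kSubsamplingType420)  == 8    // width is subsampled
    GetLumaHeight(4, kSubsamplingType422) == 4    // 4:2:2 keeps full height
    GetLumaHeight(4, kSubsamplingType420) == 8
    GetLumaWidth(16, kSubsamplingType420) == 16   // larger dimensions unchanged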
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed(
+ const char* const digest, const int num_runs) {
+  // In normal builds the C initialization skips a table entry when an
+  // assembly implementation exists, so this pointer may be null here.
+ if (cur_cfl_subsampler_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ const int width = GetLumaWidth(block_width_, subsampling_type);
+ const int height = GetLumaHeight(block_height_, subsampling_type);
+ Pixel* src = intra_pred_mem_.ref_src;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ src[j] = rnd.RandRange(1 << bitdepth);
+ }
+ src += kMaxBlockSize;
+ }
+ const absl::Time start = absl::Now();
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ for (int run = 0; run < num_runs; ++run) {
+ cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+ luma, sizeof(luma), elapsed_time);
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel,
+ subsampling_type>::TestSaturatedValues() {
+ if (base_cfl_subsampler_ == nullptr) return;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+ width -= 8) {
+ for (int height = GetLumaHeight(block_height_, subsampling_type);
+ height > 0; height -= 8) {
+ Pixel* src = intra_pred_mem_.ref_src;
+ for (int y = 0; y < height; ++y) {
+ Memset(src, (1 << bitdepth) - 1, width);
+ Memset(src + width, 0, kMaxBlockSize - width);
+ src += kMaxBlockSize;
+ }
+ Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+ kMaxBlockSize * (kMaxBlockSize - height));
+
+ int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+ stride);
+ cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+ stride);
+ if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+ reinterpret_cast<uint16_t*>(luma_base[0]),
+ block_width_, block_height_,
+ kCflLumaBufferStride, kCflLumaBufferStride,
+ true)) {
+ FAIL() << "Result from optimized version of CFL subsampler"
+ << " differs from reference. max_luma_width: " << width
+ << " max_luma_height: " << height;
+ }
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() {
+ if (base_cfl_subsampler_ == nullptr) return;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+ width -= 8) {
+ for (int height = GetLumaHeight(block_height_, subsampling_type);
+ height > 0; height -= 8) {
+ Pixel* src = intra_pred_mem_.ref_src;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ src[j] = rnd.RandRange(1 << bitdepth);
+ }
+ Memset(src + width, 0, kMaxBlockSize - width);
+ src += kMaxBlockSize;
+ }
+ Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+ kMaxBlockSize * (kMaxBlockSize - height));
+
+ int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+ base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+ stride);
+ cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+ stride);
+ if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+ reinterpret_cast<uint16_t*>(luma_base[0]),
+ block_width_, block_height_,
+ kCflLumaBufferStride, kCflLumaBufferStride,
+ true)) {
+ FAIL() << "Result from optimized version of CFL subsampler"
+ << " differs from reference. max_luma_width: " << width
+ << " max_luma_height: " << height;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>;
+
+const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed";
+ static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d";
+ static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec";
+ static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8";
+ static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5";
+ static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c";
+ static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c";
+ static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6";
+ static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778";
+ static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3";
+ static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd";
+ static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0";
+ static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36";
+ static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest8bpp444 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType444>;
+using CflSubsamplerTest8bpp422 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType422>;
+using CflSubsamplerTest8bpp420 =
+ CflSubsamplerTest<8, uint8_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b",
+ "1d03f091956838e7f2b113aabd8b9da9"};
+ static const char* const kDigests4x8[3] = {
+ "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d",
+ "68a334f5d2abecbc78562b3280b5fb0c"};
+ static const char* const kDigests4x16[3] = {
+ "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91",
+ "b0600f0bc3fbfc374bb3628360dcae5c"};
+ static const char* const kDigests8x4[3] = {
+ "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e",
+ "3522d3a4dd3839d1a86fb39b31a86d52"};
+ static const char* const kDigests8x8[3] = {
+ "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf",
+ "082e34ba04d04d7cd6fe408823987602"};
+ static const char* const kDigests8x16[3] = {
+ "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a",
+ "d8d266282cb7123f780bd7266e8f5913"};
+ static const char* const kDigests8x32[3] = {
+ "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff",
+ "d294fb6399edaa267aa167407c0ebccb"};
+ static const char* const kDigests16x4[3] = {
+ "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64",
+ "a928ebbec2db1508c8831a440d82eb98"};
+ static const char* const kDigests16x8[3] = {
+ "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b",
+ "c8e71c2fddbb850c5a50592ee5975368"};
+ static const char* const kDigests16x16[3] = {
+ "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd",
+ "2ffeed4d592a0455f6d888913969827f"};
+ static const char* const kDigests16x32[3] = {
+ "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883",
+ "185ca976ccbef7fb5f3f8c6aa22d5a79"};
+ static const char* const kDigests32x8[3] = {
+ "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20",
+ "9847d88eaaa57c086a2e6aed583048d3"};
+ static const char* const kDigests32x16[3] = {
+ "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc",
+ "a4189655fe714b82eb88cb5092c0ad76"};
+ static const char* const kDigests32x32[3] = {
+ "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88",
+ "db200bc8ccbeacd6a42d6b8e5ad1d931"};
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
+
+const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d";
+ static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b";
+ static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd";
+ static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90";
+ static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0";
+ static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe";
+ static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d";
+ static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7";
+ static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef";
+ static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7";
+ static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79";
+ static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1";
+ static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74";
+ static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest10bpp444 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest10bpp422 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest10bpp420 =
+ CflSubsamplerTest<10, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451",
+ "93d1d9df2861240d88f5618e42178654"};
+ static const char* const kDigests4x8[3] = {
+ "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27",
+ "cf88b6d1b7b025cfa0082361775aeb75"};
+ static const char* const kDigests4x16[3] = {
+ "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c",
+ "0884a0e53384cd5173035ad8966d8f2f"};
+ static const char* const kDigests8x4[3] = {
+ "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18",
+ "8070668aa389c1d09f8aaf43c1223e8c"};
+ static const char* const kDigests8x8[3] = {
+ "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193",
+ "4a1110382e56b42d3b7a4132bccc01ee"};
+ static const char* const kDigests8x16[3] = {
+ "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30",
+ "bd012ca1cea256dd02c231339a4cf200"};
+ static const char* const kDigests8x32[3] = {
+ "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842",
+ "71888b17b832ef55c0cd9449c0e6b077"};
+ static const char* const kDigests16x4[3] = {
+ "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96",
+ "917cd884923139d5c05a11000722e3b6"};
+ static const char* const kDigests16x8[3] = {
+ "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9",
+ "edded6d166a77a6c3ff46fddc13f372f"};
+ static const char* const kDigests16x16[3] = {
+ "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a",
+ "2f1a91898812d2c9320c7506b3a72eb4"};
+ static const char* const kDigests16x32[3] = {
+ "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796",
+ "e44e18cb8bda2d34b52c96d5b6b510be"};
+ static const char* const kDigests32x8[3] = {
+ "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f",
+ "d12746071bee96ddc075c6368bc9fbaf"};
+ static const char* const kDigests32x16[3] = {
+ "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7",
+ "02513142d109aee10f081cacfb33d1c5"};
+ static const char* const kDigests32x32[3] = {
+ "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f",
+ "12d13828db57605b00ce99469489651d"};
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Cfl predictors are available only for transform sizes with
+// max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+ kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
new file mode 100644
index 0000000..e670769
--- /dev/null
+++ b/src/dsp/intrapred_directional.cc
@@ -0,0 +1,252 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+
+  // If xstep == 64, |shift| always evaluates to 0, which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45-degree prediction.
+ if (xstep == 64) {
+ // 7.11.2.10. Intra edge upsample selection process
+ // if ( d <= 0 || d >= 40 ) useUpsample = 0
+  // For |upsampled_top| the delta is |predictor_angle - 90|. Since
+  // |predictor_angle| is 45, the delta is also 45.
+ assert(!upsampled_top);
+ const Pixel* top_ptr = top + 1;
+ for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+ memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+ }
+ return;
+ }
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
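+    // For example, with a hypothetical xstep of 40 and no upsampling, the
+    // first row has top_x = 40, so top_base_x = 0 and shift = 20; each output
+    // pixel is (top[top_base_x] * 12 + top[top_base_x + 1] * 20 + 16) >> 5, a
+    // 32-weight blend of the two nearest top samples.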
+ int x = 0;
+ do {
+ if (top_base_x >= max_base_x) {
+ Memset(dst + x, top[max_base_x], width - x);
+ break;
+ }
+
+ const int val =
+ top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+ top_base_x += base_step;
+ } while (++x < width);
+
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+ assert(ystep > 0);
+
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int scale_bits_x = 6 - upsample_top_shift;
+ const int scale_bits_y = 6 - upsample_left_shift;
+ const int min_base_x = -(1 << upsample_top_shift);
+ const int base_step_x = 1 << upsample_top_shift;
+ int y = 0;
+ int top_x = -xstep;
+ do {
+ int top_base_x = top_x >> scale_bits_x;
+ int left_y = (y << 6) - ystep;
+ int x = 0;
+ do {
+ int val;
+ if (top_base_x >= min_base_x) {
+ const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+ val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ } else {
+ // Note this assumes an arithmetic shift to handle negative values.
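+        // For instance, with no upsampling and left_y == -12, an arithmetic
+        // shift gives left_base_y = -12 >> 6 = -1 (rounding toward negative
+        // infinity) and shift = (-12 & 0x3F) >> 1 = 26, so the prediction
+        // blends left[-1] and left[0] with weights 6 and 26.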
+ const int left_base_y = left_y >> scale_bits_y;
+ const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+ assert(left_base_y >= -(1 << upsample_left_shift));
+ val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ }
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step_x;
+ left_y -= ystep;
+ } while (++x < width);
+
+ top_x -= xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ stride /= sizeof(Pixel);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ base_step * (height - 1)); // left_base_y
+
+ int left_y = ystep;
+ int x = 0;
+ do {
+ auto* dst = static_cast<Pixel*>(dest);
+
+ int left_base_y = left_y >> scale_bits;
+ int y = 0;
+ do {
+ const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const int val =
+ left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ dst += stride;
+ left_base_y += base_step;
+ } while (++y < height);
+
+ left_y += ystep;
+ } while (++x < width);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void IntraPredDirectionalInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intrapred_directional.h b/src/dsp/intrapred_directional.h
new file mode 100644
index 0000000..bcd1bc1
--- /dev/null
+++ b/src/dsp/intrapred_directional.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note that includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes matters because each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
new file mode 100644
index 0000000..ebf9da0
--- /dev/null
+++ b/src/dsp/intrapred_directional_test.cc
@@ -0,0 +1,929 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+constexpr int kNumDirectionalIntraPredictors = 3;
+
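+// The nominal angles of the eight directional prediction modes, including the
+// vertical mode at 90 degrees and the horizontal mode at 180 degrees.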
+constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203};
+
+const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = {
+ "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2",
+ "kDirectionalIntraPredictorZone3"};
+
+int16_t GetDirectionalIntraPredictorDerivative(const int angle) {
+ EXPECT_GE(angle, 3);
+ EXPECT_LE(angle, 87);
+ return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1];
+}
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
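+      // Zero the rest of the buffers so that the only random samples lie at
+      // indices [-2, 2 * kMaxBlockSize) of |left| and |top|; any overread by
+      // an optimized predictor is then deterministic.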
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// DirectionalIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ DirectionalIntraPredTest() = default;
+ DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
+ DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
+ ~DirectionalIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ enum Zone { kZone1, kZone2, kZone3, kNumZones };
+
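+  // The angle deltas span MAX_ANGLE_DELTA (3) steps of ANGLE_STEP (3) degrees
+  // on either side of each base angle.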
+ enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 };
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredDirectionalInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+ base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+ base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ base_directional_intra_pred_zone1_ = nullptr;
+ base_directional_intra_pred_zone2_ = nullptr;
+ base_directional_intra_pred_zone3_ = nullptr;
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredDirectionalInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraPredDirectionalInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+ cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+ cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+ // Skip functions that haven't been specialized for this particular
+ // architecture.
+ if (cur_directional_intra_pred_zone1_ ==
+ base_directional_intra_pred_zone1_) {
+ cur_directional_intra_pred_zone1_ = nullptr;
+ }
+ if (cur_directional_intra_pred_zone2_ ==
+ base_directional_intra_pred_zone2_) {
+ cur_directional_intra_pred_zone2_ = nullptr;
+ }
+ if (cur_directional_intra_pred_zone3_ ==
+ base_directional_intra_pred_zone3_) {
+ cur_directional_intra_pred_zone3_ = nullptr;
+ }
+ }
+
+ bool IsEdgeUpsampled(int delta, const int filter_type) const {
+ delta = std::abs(delta);
+ if (delta == 0 || delta >= 40) return false;
+ const int block_wh = block_width_ + block_height_;
+ return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16;
+ }
+
+  // Returns the minimum and maximum (exclusive) angles to which the
+  // predictor should be applied.
+ void GetZoneAngleRange(const Zone zone, int* const min_angle,
+ int* const max_angle) const {
+ ASSERT_NE(min_angle, nullptr);
+ ASSERT_NE(max_angle, nullptr);
+ switch (zone) {
+ // The overall minimum angle comes from mode D45_PRED, yielding:
+ // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36
+ // The overall maximum angle comes from mode D203_PRED, yielding:
+ // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212
+ // The angles 180 and 90 are not permitted because they correspond to
+ // V_PRED and H_PRED, which are handled in distinct functions.
+ case kZone1:
+ *min_angle = 36;
+ *max_angle = 87;
+ break;
+ case kZone2:
+ *min_angle = 93;
+ *max_angle = 177;
+ break;
+ case kZone3:
+ *min_angle = 183;
+ *max_angle = 212;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << zone;
+ break;
+ }
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors],
+ Zone zone, int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_;
+ DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_;
+ DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_;
+ DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_;
+ DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_;
+ DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_;
+};
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumDirectionalIntraPredictors], const Zone zone,
+ const int num_runs) {
+ switch (zone) {
+ case kZone1:
+ if (cur_directional_intra_pred_zone1_ == nullptr) return;
+ break;
+ case kZone2:
+ if (cur_directional_intra_pred_zone2_ == nullptr) return;
+ break;
+ case kZone3:
+ if (cur_directional_intra_pred_zone3_ == nullptr) return;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << zone;
+ break;
+ }
+ ASSERT_NE(digests, nullptr);
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+ // Allocate separate blocks for each angle + filter + upsampled combination.
+  // Add a 1-pixel right border to test for overwrites.
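+  // kMaxZoneAngles is reached in zone 2: base angles 113, 135 and 157 keep
+  // all 7 angle deltas, while base angles 90 and 180 each contribute only the
+  // 3 deltas that land strictly between 90 and 180, for a total of 27.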
+ static constexpr int kMaxZoneAngles = 27; // zone 2
+ static constexpr int kMaxFilterTypes = 2;
+ static constexpr int kBlockBorder = 1;
+ static constexpr int kBorderSize =
+ kBlockBorder * kMaxZoneAngles * kMaxFilterTypes;
+ const int ref_stride =
+ kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize;
+ const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_;
+
+ using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>;
+ AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+ &AlignedFree);
+ AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+ &AlignedFree);
+ ASSERT_NE(ref_src, nullptr);
+ ASSERT_NE(dest, nullptr);
+
+ const int mask = (1 << bitdepth) - 1;
+ for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) {
+ ref_src[i] = rnd.Rand16() & mask;
+ }
+
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle));
+
+ absl::Duration elapsed_time;
+ for (int run = 0; run < num_runs; ++run) {
+ Pixel* dst = dest.get();
+ memcpy(dst, ref_src.get(), ref_alloc_size);
+ for (const auto& base_angle : kBaseAngles) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+ angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle < min_angle || predictor_angle > max_angle) {
+ continue;
+ }
+
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]);
+ if (predictor_angle < 90) {
+ ASSERT_EQ(zone, kZone1);
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone1_(dst, stride, top, block_width_,
+ block_height_, xstep,
+ upsampled_top);
+ elapsed_time += absl::Now() - start;
+ } else if (predictor_angle < 180) {
+ ASSERT_EQ(zone, kZone2);
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone2_(
+ dst, stride, top, left, block_width_, block_height_, xstep,
+ ystep, upsampled_top, upsampled_left);
+ elapsed_time += absl::Now() - start;
+ } else {
+ ASSERT_EQ(zone, kZone3);
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ const absl::Time start = absl::Now();
+ cur_directional_intra_pred_zone3_(dst, stride, left, block_width_,
+ block_height_, ystep,
+ upsampled_left);
+ elapsed_time += absl::Now() - start;
+ }
+ dst += block_width_ + kBlockBorder;
+ }
+ }
+ }
+ }
+
+ test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone],
+ digests[zone], dest.get(), ref_alloc_size,
+ elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+ for (int i = kZone1; i < kNumZones; ++i) {
+ switch (i) {
+ case kZone1:
+ if (cur_directional_intra_pred_zone1_ == nullptr) continue;
+ break;
+ case kZone2:
+ if (cur_directional_intra_pred_zone2_ == nullptr) continue;
+ break;
+ case kZone3:
+ if (cur_directional_intra_pred_zone3_ == nullptr) continue;
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << i;
+ break;
+ }
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(
+ GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+ for (const auto& base_angle : kBaseAngles) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+ angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+ continue;
+ }
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ if (predictor_angle < 90) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top,
+ block_width_, block_height_,
+ xstep, upsampled_top);
+ } else if (predictor_angle < 180) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ cur_directional_intra_pred_zone2_(
+ intra_pred_mem_.dst, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ } else {
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left,
+ block_width_, block_height_,
+ ystep, upsampled_left);
+ }
+
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << kDirectionalPredNames[i]
+ << " (angle: " << predictor_angle
+ << " filter type: " << filter_type
+ << ") to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ return;
+ }
+ }
+ }
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+
+ for (int i = kZone1; i < kNumZones; ++i) {
+ // Only run when there is a reference version (base) and a different
+ // optimized version (cur).
+ switch (i) {
+ case kZone1:
+ if (base_directional_intra_pred_zone1_ == nullptr ||
+ cur_directional_intra_pred_zone1_ == nullptr) {
+ continue;
+ }
+ break;
+ case kZone2:
+ if (base_directional_intra_pred_zone2_ == nullptr ||
+ cur_directional_intra_pred_zone2_ == nullptr) {
+ continue;
+ }
+ break;
+ case kZone3:
+ if (base_directional_intra_pred_zone3_ == nullptr ||
+ cur_directional_intra_pred_zone3_ == nullptr) {
+ continue;
+ }
+ break;
+ case kNumZones:
+ FAIL() << "Invalid zone value: " << i;
+ break;
+ }
+ int min_angle = 0, max_angle = 0;
+ ASSERT_NO_FATAL_FAILURE(
+ GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+ for (const auto& base_angle : kBaseAngles) {
+ for (int n = 0; n < 1000; ++n) {
+ for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+ for (int angle_delta = kAngleDeltaStart;
+ angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) {
+ const int predictor_angle = base_angle + angle_delta;
+ if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+ continue;
+ }
+ ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+ << " angle_delta: " << angle_delta;
+
+ intra_pred_mem_.Reset(&rnd);
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+
+ const bool upsampled_left =
+ IsEdgeUpsampled(predictor_angle - 180, filter_type);
+ const bool upsampled_top =
+ IsEdgeUpsampled(predictor_angle - 90, filter_type);
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ if (predictor_angle < 90) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle);
+ base_directional_intra_pred_zone1_(
+ intra_pred_mem_.ref_src, stride, top, block_width_,
+ block_height_, xstep, upsampled_top);
+ cur_directional_intra_pred_zone1_(
+ intra_pred_mem_.dst, stride, top, block_width_, block_height_,
+ xstep, upsampled_top);
+ } else if (predictor_angle < 180) {
+ const int xstep =
+ GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+ base_directional_intra_pred_zone2_(
+ intra_pred_mem_.ref_src, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ cur_directional_intra_pred_zone2_(
+ intra_pred_mem_.dst, stride, top, left, block_width_,
+ block_height_, xstep, ystep, upsampled_top, upsampled_left);
+ } else {
+ ASSERT_LT(predictor_angle, 270);
+ const int ystep =
+ GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+ base_directional_intra_pred_zone3_(
+ intra_pred_mem_.ref_src, stride, left, block_width_,
+ block_height_, ystep, upsampled_left);
+ cur_directional_intra_pred_zone3_(
+ intra_pred_mem_.dst, stride, left, block_width_,
+ block_height_, ystep, upsampled_left);
+ }
+
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << kDirectionalPredNames[i]
+ << " differs from reference at angle "
+ << predictor_angle << " with filter type "
+ << filter_type << " in iteration #" << n;
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>;
+
+const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "9cfc1da729ad08682e165826c29b280b",
+ "bb73539c7afbda7bddd2184723b932d6",
+ "9d2882800ffe948196e984a26a2da72c",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "090efe6f83cc6fa301f65d3bbd5c38d2",
+ "d0fba4cdfb90f8bd293a94cae9db1a15",
+ "f7ad0eeab4389d0baa485d30fec87617",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "1d32b33c75fe85248c48cdc8caa78d84",
+ "7000e18159443d366129a6cc6ef8fcee",
+ "06c02fac5f8575f687abb3f634eb0b4c",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "1b591799685bc135982114b731293f78",
+ "5cd9099acb9f7b2618dafa6712666580",
+ "d023883efede88f99c19d006044d9fa1",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "f1e46ecf62a2516852f30c5025adb7ea",
+ "864442a209c16998065af28d8cdd839a",
+ "411a6e554868982af577de69e53f12e8",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "89278302be913a85cfb06feaea339459",
+ "6c42f1a9493490cd4529fd40729cec3c",
+ "2516b5e1c681e5dcb1acedd5f3d41106",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "aea7078f3eeaa8afbfe6c959c9e676f1",
+ "cad30babf12729dda5010362223ba65c",
+ "ff384ebdc832007775af418a2aae1463",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "964a821c313c831e12f4d32e616c0b55",
+ "adf6dad3a84ab4d16c16eea218bec57a",
+ "a54fa008d43895e523474686c48a81c2",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "fe2851b4e4f9fcf924cf17d50415a4c0",
+ "50a0e279c481437ff315d08eb904c733",
+ "0682065c8fb6cbf9be4949316c87c9e5",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "ef15503b1943642e7a0bace1616c0e11",
+ "bf1a4d3f855f1072a902a88ec6ce0350",
+ "7e87a03e29cd7fd843fd71b729a18f3f",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "f7b636615d2e5bf289b5db452a6f188d",
+ "e95858c532c10d00b0ce7a02a02121dd",
+ "34a18ccf58ef490f32268e85ce8c7de4",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "b250099986c2fab9670748598058846b",
+ "f25d80af4da862a9b6b72979f1e17cb4",
+ "5347dc7bc346733b4887f6c8ad5e0898",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "72e4c9f8af043b1cb1263490351818ab",
+ "1fc010d2df011b9e4e3d0957107c78df",
+ "f4cbfa3ca941ef08b972a68d7e7bafc4",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "37e5a1aaf7549d2bce08eece9d20f0f6",
+ "6a2794025d0aca414ab17baa3cf8251a",
+ "63dd37a6efdc91eeefef166c99ce2db1",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "198aabc958992eb49cceab97d1acb43e",
+ "aee88b6c8bacfcf38799fe338e6c66e7",
+ "01e8f8f96696636f6d79d33951907a16",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "0611390202c4f90f7add7aec763ded58",
+ "960240c7ceda2ccfac7c90b71460578a",
+ "7e7d97594aab8ad56e8c01c340335607",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "7e1f567e7fc510757f2d89d638bc826f",
+ "c929d687352ce40a58670be2ce3c8c90",
+ "f6881e6a9ba3c3d3d730b425732656b1",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "27b4c2a7081d4139f22003ba8b6dfdf2",
+ "301e82740866b9274108a04c872fa848",
+ "98d3aa4fef838f4abf00dac33806659f",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "b31816db8fade3accfd975b21aa264c7",
+ "2adce01a03b9452633d5830e1a9b4e23",
+ "7b988fadba8b07c36e88d7be6b270494",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_));
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "a683f4d7ccd978737615f61ecb4d638d",
+ "90c94374eaf7e9501f197863937b8639",
+ "0d3969cd081523ac6a906eecc7980c43",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "c3ffa2979b325644e4a56c882fe27347",
+ "1f61f5ee413a9a3b8d1d93869ec2aee0",
+ "4795ea944779ec4a783408769394d874",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "45c3282c9aa51024c1d64a40f230aa45",
+ "5cd47dd69f8bd0b15365a0c5cfc0a49a",
+ "06336c507b05f98c1d6a21abc43e6182",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "7370476ff0abbdc5e92f811b8879c861",
+ "a239a50adb28a4791b52a0dfff3bee06",
+ "4779a17f958a9ca04e8ec08c5aba1d36",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "305463f346c376594f82aad8304e0362",
+ "0cd481e5bda286c87a645417569fd948",
+ "48c7899dc9b7163b0b1f61b3a2b4b73e",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "5c18fd5339be90628c82b1fb6af50d5e",
+ "35eaa566ebd3bb7c903cfead5dc9ac78",
+ "9fdb0e790e5965810d02c02713c84071",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "2168d6cc858c704748b7b343ced2ac3a",
+ "1d3ce273107447faafd2e55877e48ffb",
+ "d344164049d1fe9b65a3ae8764bbbd37",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "dcef2cf51abe3fe150f388a14c762d30",
+ "6a810b289b1c14f8eab8ca1274e91ecd",
+ "c94da7c11f3fb11963d85c8804fce2d9",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "50a0d08b0d99b7a574bad2cfb36efc39",
+ "2dcb55874db39da70c8ca1318559f9fe",
+ "6390bcd30ff3bc389ecc0a0952bea531",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "7146c83c2620935606d49f3cb5876f41",
+ "2318ddf30c070a53c9b9cf199cd1b2c5",
+ "e9042e2124925aa7c1b6110617cb10e8",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "c970f401de7b7c5bb4e3ad447fcbef8f",
+ "a18cc70730eecdaa31dbcf4306ff490f",
+ "32c1528ad4a576a2210399d6b4ccd46e",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "00b3f0007da2e5d01380594a3d7162d5",
+ "1971af519e4a18967b7311f93efdd1b8",
+ "e6139769ce5a9c4982cfab9363004516",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "08107ad971179cc9f465ae5966bd4901",
+ "b215212a3c0dfe9182c4f2e903d731f7",
+ "791274416a0da87c674e1ae318b3ce09",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "94ea6cccae35b5d08799aa003ac08ccf",
+ "ae105e20e63fb55d4fd9d9e59dc62dde",
+ "973d0b2358ea585e4f486e7e645c5310",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "d14c695c4853ddf5e5d8256bc1d1ed60",
+ "6bd0ebeb53adecc11442b1218b870cb7",
+ "e03bc402a9999aba8272275dce93e89f",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "b21a8a8723758392ee659eeeae518a1e",
+ "e50285454896210ce44d6f04dfde05a7",
+ "f0f8ea0c6c2acc8d7d390927c3a90370",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "ce51db16fd4fa56e601631397b098c89",
+ "aa87a8635e02c1e91d13158c61e443f6",
+ "4c1ee3afd46ef34bd711a34d0bf86f13",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "25aaf5971e24e543e3e69a47254af777",
+ "eb6f444b3df127d69460778ab5bf8fc1",
+ "2f846cc0d506f90c0a58438600819817",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "b26ce5b5f4b5d4a438b52e5987877fb8",
+ "35721a00a70938111939cf69988d928e",
+ "0af7ec35939483fac82c246a13845806",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_));
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+constexpr TransformSize kTransformSizes[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc
new file mode 100644
index 0000000..f4bd296
--- /dev/null
+++ b/src/dsp/intrapred_filter.cc
@@ -0,0 +1,144 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+// The recursive filter produces each 4x2 sub-block from seven neighboring
+// pixels: the top-left pixel, the 4 pixels above, and the 2 pixels to the left
+// of the sub-block. Each of the 8 output pixels has its own 7-tap filter; the
+// set of 8 filters is selected according to |pred|. Each successive 4x2
+// sub-block uses the prediction output of the sub-blocks above and to the
+// left, unless they are adjacent to the |top_row| or |left_column|.
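+//
+// As an illustrative (non-normative) example, for an 8x4 block the function
+// below emits four 4x2 sub-blocks in raster order: columns 0-3 and then 4-7
+// of rows 0-1, followed by the same for rows 2-3. The first pass takes its
+// top context from |top_row| and later passes take it from the previous
+// pass's bottom output row, cached in |buffer|; each pass's leftmost
+// sub-block takes its left context from |left_column|, and sub-blocks to its
+// right take it from the sub-block just produced.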
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const FilterIntraPredictor pred, const int width,
+ const int height) {
+ const int kMaxPixel = (1 << bitdepth) - 1;
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ Pixel buffer[3][33]; // cache 2 rows + top & left boundaries
+ memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ int row0 = 0, row2 = 2;
+ int ystep = 1;
+ int y = 0;
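+  // buffer[1] always receives the first output row of each 4x2 pass;
+  // buffer[0] and buffer[2] alternate (via row0/row2 and the sign of ystep)
+  // between providing the top context and receiving the second output row,
+  // which becomes the next pass's top context.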
+ do {
+ buffer[1][0] = left[y];
+ buffer[row2][0] = left[y + 1];
+ int x = 1;
+ do {
+ const Pixel p0 = buffer[row0][x - 1]; // top-left
+ const Pixel p1 = buffer[row0][x + 0]; // top 0
+ const Pixel p2 = buffer[row0][x + 1]; // top 1
+ const Pixel p3 = buffer[row0][x + 2]; // top 2
+ const Pixel p4 = buffer[row0][x + 3]; // top 3
+ const Pixel p5 = buffer[1][x - 1]; // left 0
+ const Pixel p6 = buffer[row2][x - 1]; // left 1
+ for (int i = 0; i < 8; ++i) {
+ const int xoffset = i & 0x03;
+ const int yoffset = (i >> 2) * ystep;
+ const int value = kFilterIntraTaps[pred][i][0] * p0 +
+ kFilterIntraTaps[pred][i][1] * p1 +
+ kFilterIntraTaps[pred][i][2] * p2 +
+ kFilterIntraTaps[pred][i][3] * p3 +
+ kFilterIntraTaps[pred][i][4] * p4 +
+ kFilterIntraTaps[pred][i][5] * p5 +
+ kFilterIntraTaps[pred][i][6] * p6;
+ // Section 7.11.2.3 specifies the right-hand side of the assignment as
+ // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+ // Since Clip1() clips a negative value to 0, it is safe to replace
+ // Round2Signed() with Round2().
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+ }
+ x += 4;
+ } while (x < width);
+ memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+ dst += stride;
+
+ // The final row becomes the top for the next pass.
+ row0 ^= 2;
+ row2 ^= 2;
+ ystep = -ystep;
+ y += 2;
+ } while (y < height);
+}
+
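+// With LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS the C implementation above is always
+// installed; otherwise it is installed only when no SIMD version has reported
+// itself via the LIBGAV1_Dsp{8,10}bpp_FilterIntraPredictor defines pulled in
+// through the architecture-specific headers.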
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void IntraPredFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/intrapred_filter.h b/src/dsp/intrapred_filter.h
new file mode 100644
index 0000000..8146b82
--- /dev/null
+++ b/src/dsp/intrapred_filter.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_filter_neon.h"
+
+// x86:
+// Note that includes should be sorted in logical order: avx2/avx/sse4, etc.
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor. This function is not thread-safe.
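+// Architecture-specific counterparts such as IntraPredFilterInit_SSE4_1() and
+// IntraPredFilterInit_NEON() are expected to be declared by the corresponding
+// headers included above.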
+void IntraPredFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc
new file mode 100644
index 0000000..c420f0a
--- /dev/null
+++ b/src/dsp/intrapred_filter_test.cc
@@ -0,0 +1,554 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = {
+ "kFilterIntraPredictorDc", "kFilterIntraPredictorVertical",
+ "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157",
+ "kFilterIntraPredictorPaeth",
+};
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
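+      // Zero the remaining padding (the leading entries of |left_mem| and
+      // |top_mem| and the trailing overread region of |top_mem|) so that any
+      // out-of-block reads see deterministic values.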
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
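+    // |left_mem| and |top_mem| include a 16 pixel lead-in so that the
+    // negative indices used above ([-1], [-2]) stay inside the allocations
+    // once the |left|/|top| pointers are offset by 16.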
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// FilterIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ FilterIntraPredTest() = default;
+ FilterIntraPredTest(const FilterIntraPredTest&) = delete;
+ FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete;
+ ~FilterIntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredFilterInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_filter_intra_pred_ = dsp->filter_intra_predictor;
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ // No need to compare C with itself.
+ base_filter_intra_pred_ = nullptr;
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraPredFilterInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredFilterInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ // Put the current architecture-specific implementation up for testing and
+ // comparison against C version.
+ cur_filter_intra_pred_ = dsp->filter_intra_predictor;
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumFilterIntraPredictors],
+ int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ FilterIntraPredictorFunc base_filter_intra_pred_;
+ FilterIntraPredictorFunc cur_filter_intra_pred_;
+};
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumFilterIntraPredictors], const int num_runs) {
+ ASSERT_NE(digests, nullptr);
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+  // IntraPredFilterInit_C() may leave the function pointer empty when a SIMD
+  // implementation is declared for this bitdepth.
+ if (cur_filter_intra_pred_ == nullptr) return;
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i],
+ digests[i], intra_pred_mem_.dst,
+ sizeof(intra_pred_mem_.dst), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ Pixel* const left = intra_pred_mem_.left_mem + 16;
+ Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+  // IntraPredFilterInit_C() may leave the function pointer empty when a SIMD
+  // implementation is declared for this bitdepth.
+ if (cur_filter_intra_pred_ == nullptr) return;
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i]
+ << " to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_filter_intra_pred_ == nullptr) return;
+
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int i = 0; i < kNumFilterIntraPredictors; ++i) {
+ // It may be worthwhile to temporarily increase this loop size when testing
+ // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) {
+ intra_pred_mem_.Reset(&rnd);
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left,
+ static_cast<FilterIntraPredictor>(i),
+ block_width_, block_height_);
+ cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left,
+ static_cast<FilterIntraPredictor>(i), block_width_,
+ block_height_);
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << kFilterIntraPredNames[i]
+ << " differs from reference in iteration #" << n;
+ break;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>;
+
+const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d",
+ "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5",
+ "2d3711616c8d8798f608e313cb07a72a",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06",
+ "73e9d1a9b6fa3e96fbd65c7dce507529", "e3ae17d9338e5aa3420d31d0e2d7ee87",
+ "750dbfe3bc5508b7031957a1d315b8bc",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf",
+ "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1",
+ "976273745b94a561fd52f5aa96fb280f",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac",
+ "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c",
+ "eab59a3710d9bdeca8fa03a15d3f95d6",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b",
+ "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070",
+ "918c2553635763a0756b20154096bca6",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142",
+ "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13",
+ "893e6995522e23ed3d613ef3797ca580",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa",
+ "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f",
+ "00c909b749e36142b133a7357271e83e",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7",
+ "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666",
+ "9e350d1cd65d520194281633f566810d",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1",
+ "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e",
+ "2f37e586769dfb88d9d4116b9c28c5ab",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005",
+ "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93",
+ "6322fba6f3bcb1f6c8e78160d200809c",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f",
+ "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252",
+ "6491add6b665aafc364c8c104a6a233d",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337",
+ "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8",
+ "bb8c34b143910ba49b01d13e94d936ac",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056",
+ "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313",
+ "75f430fdea0b7b5b66866fd68a795a6a",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711",
+ "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9",
+ "73c73237a5d891096066b186abf96854",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1",
+ "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0",
+ "1b363db246aa5400f49479b7d5d41799",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697",
+ "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c",
+ "c6bcf564e63c42148d9917f089566432",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854",
+ "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146",
+ "f3c07ea65db08c639136a5a9270f95ff",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce",
+ "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7",
+ "5b58567f94ae9fa930f700c68c17399d",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6",
+ "1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b",
+ "5187bb3df943d154cb01fb2f244ff86f",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb",
+ "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436",
+ "0a8c23047b0364f5687b62b01f043359",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b",
+ "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc",
+ "6e963a89beac6f3a362c269d1017f9a8",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205",
+ "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204",
+ "19020b82ce8bb49a511820c7e1d58e99",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38",
+ "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b",
+ "c12180a6dcde0c3579befbb5304ff70b",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e",
+ "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca",
+ "4d20aa440e38b6b7ac83c8c54d313169",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe",
+ "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9",
+ "e7640d81f891e1d06e7da75c6ae74d93",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8",
+ "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783",
+ "78e9017f7434665d32ec59795aed0012",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883",
+ "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee",
+ "b5951334dfbe6229d828e03cd2d98538",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230",
+ "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5",
+ "4530222ea1de546e202630fcf43f4526",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Filter-intra and Cfl predictors are available only for transform sizes
+// with max(width, height) <= 32.
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+ kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc
new file mode 100644
index 0000000..83c005e
--- /dev/null
+++ b/src/dsp/intrapred_smooth.cc
@@ -0,0 +1,738 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct SmoothFuncs_C {
+ SmoothFuncs_C() = delete;
+
+ static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+ const void* top_row, const void* left_column);
+};
+
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
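+
+// With kSmoothWeightScale == 8 the weights above are on a 256 scale, and the
+// 2-D Smooth() predictor below computes
+//   pred[y][x] = Round2(w_y[y] * top[x] + (256 - w_y[y]) * bottom_left +
+//                       w_x[x] * left[y] + (256 - w_x[x]) * top_right,
+//                       kSmoothWeightScale + 1).
+// SmoothVertical() and SmoothHorizontal() keep only the vertical or the
+// horizontal pair of terms and shift by kSmoothWeightScale instead.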
+
+// SmoothFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(
+ block_width >= 4 && block_height >= 4,
+ "Weights for smooth predictor undefined for block width/height < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+ // + 256. With the descale there's no need for saturation.
+ dst[x] = static_cast<Pixel>(
+ RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(block_height >= 4,
+ "Weights for smooth predictor undefined for block height < 4");
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ static_assert(block_width >= 4,
+ "Weights for smooth predictor undefined for block width < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_x[x]);
+ uint32_t pred = weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct SmoothDefs {
+ SmoothDefs() = delete;
+
+ using _4x4 = SmoothFuncs_C<4, 4, Pixel>;
+ using _4x8 = SmoothFuncs_C<4, 8, Pixel>;
+ using _4x16 = SmoothFuncs_C<4, 16, Pixel>;
+ using _8x4 = SmoothFuncs_C<8, 4, Pixel>;
+ using _8x8 = SmoothFuncs_C<8, 8, Pixel>;
+ using _8x16 = SmoothFuncs_C<8, 16, Pixel>;
+ using _8x32 = SmoothFuncs_C<8, 32, Pixel>;
+ using _16x4 = SmoothFuncs_C<16, 4, Pixel>;
+ using _16x8 = SmoothFuncs_C<16, 8, Pixel>;
+ using _16x16 = SmoothFuncs_C<16, 16, Pixel>;
+ using _16x32 = SmoothFuncs_C<16, 32, Pixel>;
+ using _16x64 = SmoothFuncs_C<16, 64, Pixel>;
+ using _32x8 = SmoothFuncs_C<32, 8, Pixel>;
+ using _32x16 = SmoothFuncs_C<32, 16, Pixel>;
+ using _32x32 = SmoothFuncs_C<32, 32, Pixel>;
+ using _32x64 = SmoothFuncs_C<32, 64, Pixel>;
+ using _64x16 = SmoothFuncs_C<64, 16, Pixel>;
+ using _64x32 = SmoothFuncs_C<64, 32, Pixel>;
+ using _64x64 = SmoothFuncs_C<64, 64, Pixel>;
+};
+
+using Defs = SmoothDefs<uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of
+// the same size.
+#define INIT_SMOOTH_WxH(DEFS, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+ DEFS::_##W##x##H::Smooth; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothVertical] = \
+ DEFS::_##W##x##H::SmoothVertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothHorizontal] = \
+ DEFS::_##W##x##H::SmoothHorizontal
+
+#define INIT_SMOOTH(DEFS) \
+ INIT_SMOOTH_WxH(DEFS, 4, 4); \
+ INIT_SMOOTH_WxH(DEFS, 4, 8); \
+ INIT_SMOOTH_WxH(DEFS, 4, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 4); \
+ INIT_SMOOTH_WxH(DEFS, 8, 8); \
+ INIT_SMOOTH_WxH(DEFS, 8, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 4); \
+ INIT_SMOOTH_WxH(DEFS, 16, 8); \
+ INIT_SMOOTH_WxH(DEFS, 16, 16); \
+ INIT_SMOOTH_WxH(DEFS, 16, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 64); \
+ INIT_SMOOTH_WxH(DEFS, 32, 8); \
+ INIT_SMOOTH_WxH(DEFS, 32, 16); \
+ INIT_SMOOTH_WxH(DEFS, 32, 32); \
+ INIT_SMOOTH_WxH(DEFS, 32, 64); \
+ INIT_SMOOTH_WxH(DEFS, 64, 16); \
+ INIT_SMOOTH_WxH(DEFS, 64, 32); \
+ INIT_SMOOTH_WxH(DEFS, 64, 64)
+
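+// INIT_SMOOTH(Defs) registers the C Smooth, SmoothVertical and
+// SmoothHorizontal predictors for every supported transform size; the
+// per-entry #ifndef blocks below do the same one entry at a time when
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is disabled and no SIMD version has
+// claimed the slot.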
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(Defs);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(DefsHbd);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+} // namespace
+
+void IntraPredSmoothInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
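
Editor's note: the `#undef INIT_SMOOTH_WxH` / `#undef INIT_SMOOTH` lines above tear down helper macros defined earlier in intrapred_smooth.cc, outside this excerpt. As a hedged sketch only -- the real macro may differ in detail -- a per-size helper of this kind typically expands to the same three assignments that are written out longhand in the 10bpp block above:

  #define INIT_SMOOTH_WxH(DEFS, W, H)                                         \
    dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] =   \
        DEFS::_##W##x##H::Smooth;                                             \
    dsp->intra_predictors[kTransformSize##W##x##H]                            \
                         [kIntraPredictorSmoothVertical] =                    \
        DEFS::_##W##x##H::SmoothVertical;                                     \
    dsp->intra_predictors[kTransformSize##W##x##H]                            \
                         [kIntraPredictorSmoothHorizontal] =                  \
        DEFS::_##W##x##H::SmoothHorizontal

As the closing `#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` suggests, the longhand per-slot guards are used when that option is off, so each C fallback is only compiled in when no SIMD specialization claims the slot.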
diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h
new file mode 100644
index 0000000..6802003
--- /dev/null
+++ b/src/dsp/intrapred_smooth.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
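
Editor's note: the comment block in this header is the crux of libgav1's dispatch scheme -- including the per-architecture headers defines one LIBGAV1_Dsp{8,10}bpp_* macro per function they implement, and downstream code keys off those macros at compile time. A minimal sketch of the convention, assuming the usual LIBGAV1_CPU_* values (the actual contents of intrapred_smooth_neon.h and intrapred_smooth_sse4.h are not part of this hunk):

  // In src/dsp/arm/intrapred_smooth_neon.h (illustrative):
  #if LIBGAV1_ENABLE_NEON
  #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
  #endif

  // A consumer can then decide at compile time whether a C fallback is needed:
  #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
  //   ... install the C implementation for this slot ...
  #endif

The "avx2/avx/sse4" ordering note matters because each x86 header only claims a slot if a stronger instruction set has not already done so.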
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
new file mode 100644
index 0000000..335aa2f
--- /dev/null
+++ b/src/dsp/intrapred_test.cc
@@ -0,0 +1,710 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ IntraPredTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ IntraPredTestBase(const IntraPredTestBase&) = delete;
+ IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+ ~IntraPredTestBase() override = default;
+
+ protected:
+ struct IntraPredMem {
+ void Reset(libvpx_test::ACMRandom* rnd) {
+ ASSERT_NE(rnd, nullptr);
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ const int mask = (1 << bitdepth) - 1;
+ for (auto& r : ref_src) r = rnd->Rand16() & mask;
+ for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+ for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+ // Some directional predictors require top-right, bottom-left.
+ for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = rnd->Rand16() & mask;
+ top[i] = rnd->Rand16() & mask;
+ }
+ // TODO(jzern): reorder this and regenerate the digests after switching
+ // random number generators.
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ left[-1] = rnd->Rand16() & mask;
+ left[-2] = rnd->Rand16() & mask;
+ top[-2] = rnd->Rand16() & mask;
+ memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+ memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+ memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+ sizeof(top_mem[0]) * kTopMemPadding);
+ }
+
+ // Set ref_src, top-left, top and left to |pixel|.
+ void Set(const Pixel pixel) {
+ Pixel* const left = left_mem + 16;
+ Pixel* const top = top_mem + 16;
+ for (auto& r : ref_src) r = pixel;
+ // Upsampling in the directional predictors extends left/top[-1] to [-2].
+ for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+ left[i] = top[i] = pixel;
+ }
+ }
+
+ // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+ static constexpr int kTopMemPadding = 7;
+ alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+ alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+ alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+ alignas(
+ kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ IntraPredMem intra_pred_mem_;
+};
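
Editor's note: the index arithmetic in IntraPredMem is worth spelling out. Both neighbor buffers are over-allocated and the working pointers are offset by 16 elements, so the negative indices used by the upsampling paths and the top-right/bottom-left extensions stay inside the allocation. Illustrative only, derived from the constants above rather than additional test code:

  Pixel* const left = left_mem + 16;  // valid indices: left[-16] .. left[127]
  Pixel* const top = top_mem + 16;    // valid indices: top[-16] .. top[134]
  // Reset() fills left[-2 .. 127] and top[-2 .. 127]; the untouched guard
  // elements are zeroed so that overreads -- e.g. the up to 7 pixels past
  // top[127] read by DirectionalZone1_Large() -- see deterministic data.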
+
+//------------------------------------------------------------------------------
+// IntraPredTest
+
+template <int bitdepth, typename Pixel>
+class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+ IntraPredTest() = default;
+ IntraPredTest(const IntraPredTest&) = delete;
+ IntraPredTest& operator=(const IntraPredTest&) = delete;
+ ~IntraPredTest() override = default;
+
+ protected:
+ using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+ using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+ using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+ void SetUp() override {
+ IntraPredTestBase<bitdepth, Pixel>::SetUp();
+ IntraPredInit_C();
+ IntraPredSmoothInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_],
+ sizeof(base_intrapreds_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_intrapreds_, 0, sizeof(base_intrapreds_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ IntraPredInit_SSE4_1();
+ IntraPredSmoothInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ IntraPredInit_NEON();
+ IntraPredSmoothInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_],
+ sizeof(cur_intrapreds_));
+
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+ // Skip functions that haven't been specialized for this particular
+ // architecture.
+ if (cur_intrapreds_[i] == base_intrapreds_[i]) {
+ cur_intrapreds_[i] = nullptr;
+ }
+ }
+ }
+
+ // These tests modify intra_pred_mem_.
+ void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs);
+ void TestSaturatedValues();
+ void TestRandomValues();
+
+ IntraPredictorFunc base_intrapreds_[kNumIntraPredictors];
+ IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors];
+};
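
Editor's note: SetUp() detects which table entries the architecture-specific init actually replaced by snapshotting the C table and then comparing function pointers; slots that still point at the C routine are nulled out and skipped. A condensed sketch of that flow (it mirrors the code above rather than adding behavior; names are shortened):

  IntraPredInit_C();                               // portable table installed
  memcpy(base, dsp->intra_predictors[tx_size], sizeof(base));
  IntraPredInit_NEON();                            // SIMD overwrites what it supports
  memcpy(cur, dsp->intra_predictors[tx_size], sizeof(cur));
  for (int i = 0; i < kNumIntraPredictors; ++i) {
    if (cur[i] == base[i]) cur[i] = nullptr;       // not specialized -> skip
  }

For the "C/" suite the snapshot is zeroed instead, so the C functions themselves are exercised by the digest tests while the random comparison test skips them (they are the reference).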
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSpeed(
+ const char* const digests[kNumIntraPredictors], const int num_runs) {
+ ASSERT_NE(digests, nullptr);
+ const auto* const left =
+ reinterpret_cast<const uint8_t*>(intra_pred_mem_.left_mem + 16);
+ const auto* const top =
+ reinterpret_cast<const uint8_t*>(intra_pred_mem_.top_mem + 16);
+
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ intra_pred_mem_.Reset(&rnd);
+
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+ if (cur_intrapreds_[i] == nullptr) continue;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const absl::Time start = absl::Now();
+ for (int run = 0; run < num_runs; ++run) {
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(ToString(tx_size_),
+ ToString(static_cast<IntraPredictor>(i)),
+ digests[i], intra_pred_mem_.dst,
+ sizeof(intra_pred_mem_.dst), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+ Pixel* const left = intra_pred_mem_.left_mem + 16;
+ Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+ intra_pred_mem_.Set(kMaxPixel);
+
+ // Skip DcFill.
+ for (int i = 1; i < kNumIntraPredictors; ++i) {
+ if (cur_intrapreds_[i] == nullptr) continue;
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i))
+ << " to produce a block containing '"
+ << static_cast<int>(kMaxPixel) << "'";
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+ // Use an alternate seed to differentiate this test from TestSpeed().
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+ for (int i = 0; i < kNumIntraPredictors; ++i) {
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_intrapreds_[i] == nullptr) continue;
+ if (cur_intrapreds_[i] == nullptr) continue;
+ // It may be worthwhile to temporarily increase this loop size when testing
+ // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) {
+ intra_pred_mem_.Reset(&rnd);
+
+ memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+ sizeof(intra_pred_mem_.dst));
+ const Pixel* const top = intra_pred_mem_.top_mem + 16;
+ const Pixel* const left = intra_pred_mem_.left_mem + 16;
+ const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+ base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left);
+ cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+ if (!test_utils::CompareBlocks(
+ intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+ block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << ToString(static_cast<IntraPredictor>(i))
+ << " differs from reference in iteration #" << n;
+ break;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+using IntraPredTest8bpp = IntraPredTest<8, uint8_t>;
+
+const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50",
+ "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df",
+ "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7",
+ "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec",
+ "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7",
+ "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7",
+ "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0",
+ "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf",
+ "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "cb8240be98444ede5ae98ca94afc1557", "460acbcf825a1fa0d8f2aa6bf2d6a21c",
+ "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a",
+ "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5",
+ "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d",
+ "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7",
+ "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd",
+ "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4",
+ "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06",
+ "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0",
+ "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0",
+ "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1",
+ "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713",
+ "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194",
+ "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42",
+ "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968",
+ "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd",
+ "a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82",
+ "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1",
+ "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289",
+ "9525dbfdbf5fba61ef9c7aa5fe887503", "8c6a7e3717ff8a459f415c79bb17341c",
+ "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462",
+ "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34",
+ "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538",
+ "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a",
+ "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5",
+ "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea",
+ "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f",
+ "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e",
+ "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8",
+ "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3",
+ "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634",
+ "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4",
+ "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d",
+ "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb",
+ "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f",
+ "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182",
+ "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "727797ef15ccd8d325476fe8f12006a3", "f77c544ac8035e01920deae40cee7b07",
+ "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151",
+ "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d",
+ "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199",
+ "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52",
+ "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894",
+ "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b",
+ "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1",
+ "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72",
+ "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6",
+ "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d",
+ "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311",
+ "0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9",
+ "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8",
+ "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202",
+ "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2",
+ "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d",
+ "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb",
+ "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512",
+ "e720ebccae7e25e36f23da53ae5b5d6a", "86fe4364734169aaa4520d799890d530",
+ "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9",
+ "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a",
+ "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3",
+ "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6",
+ "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4",
+ "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb",
+ "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530",
+ "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233",
+ "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e",
+ "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755",
+ "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe",
+ "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27",
+ "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest8bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs);
+}
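
Editor's note: the run count is scaled so every block size touches roughly the same number of pixels, about 2e9. For example, a 4x4 block gets 2.0e9 / 16 = 125,000,000 iterations while a 64x64 block gets 2.0e9 / 4096 ~= 488,281. The 10bpp speed test below uses the same scaling.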
+
+TEST_P(IntraPredTest8bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraPredTest10bpp = IntraPredTest<10, uint16_t>;
+
+const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705",
+ "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96",
+ "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3",
+ "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25",
+ "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113",
+ "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6",
+ "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847",
+ "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968",
+ "a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86",
+ "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3",
+ "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418",
+ "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb",
+ "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2",
+ "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417",
+ "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81",
+ "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa",
+ "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304",
+ "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01",
+ "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f",
+ "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348",
+ "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2",
+ "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec",
+ "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1",
+ "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec",
+ "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e",
+ "8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239",
+ "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4",
+ "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b",
+ "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd",
+ "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b",
+ "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699",
+ "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284",
+ "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76",
+ "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175",
+ "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408",
+ "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34",
+ "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50",
+ "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5",
+ "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4",
+ "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0",
+ "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378",
+ "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5",
+ "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f",
+ "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb",
+ "a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604",
+ "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab",
+ "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0",
+ "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94",
+ "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53",
+ "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80",
+ "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65",
+ "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a",
+ "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80",
+ "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd",
+ "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918",
+ "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a",
+ "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322",
+ "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493",
+ "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e",
+ "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998",
+ "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b",
+ "58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751",
+ "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e",
+ "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e",
+ "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722",
+ "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba",
+ "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017",
+ "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217",
+ "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785",
+ "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1",
+ "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a",
+ "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86",
+ "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65",
+ "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a",
+ "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e",
+ "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c",
+ "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest10bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest10bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+constexpr TransformSize kTransformSizes[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+ return os << ToString(tx_size);
+}
+
+} // namespace libgav1
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
index a03fad2..ed984d8 100644
--- a/src/dsp/inverse_transform.cc
+++ b/src/dsp/inverse_transform.cc
@@ -1184,9 +1184,10 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
Residual tx_buffer[64];
for (int j = 0; j < tx_width; ++j) {
const int flipped_j = flip_columns ? tx_width - j - 1 : j;
- for (int i = 0; i < tx_height; ++i) {
+ int i = 0;
+ do {
tx_buffer[i] = residual[i][flipped_j];
- }
+ } while (++i != tx_height);
if (adjusted_tx_height == 1) {
dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
} else {
@@ -1211,6 +1212,7 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
//------------------------------------------------------------------------------
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
template <int bitdepth, typename Residual, typename Pixel>
void InitAll(Dsp* const dsp) {
// Maximum transform size for Dct is 64.
@@ -1325,6 +1327,7 @@ void InitAll(Dsp* const dsp) {
Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
/*is_row=*/false>;
}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc
new file mode 100644
index 0000000..623e203
--- /dev/null
+++ b/src/dsp/inverse_transform_test.cc
@@ -0,0 +1,536 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kTransformSize1DNames[kNum1DTransformSizes] = {
+ "k1DTransformSize4", "k1DTransformSize8", "k1DTransformSize16",
+ "k1DTransformSize32", "k1DTransformSize64"};
+
+constexpr TransformSize1D kRow1DTransformSizes[] = {
+ k1DTransformSize4, k1DTransformSize4, k1DTransformSize4,
+ k1DTransformSize8, k1DTransformSize8, k1DTransformSize8,
+ k1DTransformSize8, k1DTransformSize16, k1DTransformSize16,
+ k1DTransformSize16, k1DTransformSize16, k1DTransformSize16,
+ k1DTransformSize32, k1DTransformSize32, k1DTransformSize32,
+ k1DTransformSize32, k1DTransformSize64, k1DTransformSize64,
+ k1DTransformSize64};
+
+constexpr TransformSize1D kCol1DTransformSizes[] = {
+ k1DTransformSize4, k1DTransformSize8, k1DTransformSize16,
+ k1DTransformSize4, k1DTransformSize8, k1DTransformSize16,
+ k1DTransformSize32, k1DTransformSize4, k1DTransformSize8,
+ k1DTransformSize16, k1DTransformSize32, k1DTransformSize64,
+ k1DTransformSize8, k1DTransformSize16, k1DTransformSize32,
+ k1DTransformSize64, k1DTransformSize16, k1DTransformSize32,
+ k1DTransformSize64};
+
+template <int bitdepth, typename SrcPixel, typename DstPixel>
+class InverseTransformTestBase : public testing::TestWithParam<TransformSize>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ InverseTransformTestBase() {
+ switch (tx_size_) {
+ case kNumTransformSizes:
+ EXPECT_NE(tx_size_, kNumTransformSizes);
+ break;
+ default:
+ block_width_ = kTransformWidth[tx_size_];
+ block_height_ = kTransformHeight[tx_size_];
+ break;
+ }
+ }
+
+ InverseTransformTestBase(const InverseTransformTestBase&) = delete;
+ InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete;
+ ~InverseTransformTestBase() override = default;
+
+ protected:
+ struct InverseTransformMem {
+ void Reset(libvpx_test::ACMRandom* rnd, int width, int height) {
+ ASSERT_NE(rnd, nullptr);
+ // Limit the residual values to bitdepth + 1 bits (magnitude plus sign) so
+ // that intermediate values in the transforms cannot go out of range.
+ const int num_bits = bitdepth + 1;
+ const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits;
+ const int mask = (1 << num_bits) - 1;
+ // Fill the residual with random data. Only the upper-left
+ // min(width, 32) x min(height, 32) region is filled; for 64-point
+ // dimensions the remaining coefficients stay zero.
+ memset(ref_src, 0, sizeof(ref_src));
+ SrcPixel* r = ref_src;
+ const int stride = width;
+ for (int y = 0; y < std::min(height, 32); ++y) {
+ for (int x = 0; x < std::min(width, 32); ++x) {
+ r[x] = rnd->Rand16() & mask;
+ // The msb of the num_bits-wide field is the sign bit; shift it up and
+ // back down so each value carries the intended sign.
+ r[x] = (r[x] << sign_shift) >> sign_shift;
+ }
+ r += stride;
+ }
+
+ // Set frame data to random values.
+ for (int y = 0; y < kMaxBlockSize; ++y) {
+ for (int x = 0; x < kMaxBlockSize; ++x) {
+ const int mask = (1 << bitdepth) - 1;
+ cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] =
+ rnd->Rand16() & mask;
+ }
+ }
+ }
+
+ // Set ref_src to |pixel|.
+ void Set(const SrcPixel pixel) {
+ for (auto& r : ref_src) r = pixel;
+ }
+
+ alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels];
+ alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels];
+
+ alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels];
+ alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels];
+
+ alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels];
+ };
+
+ void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+ const TransformSize tx_size_ = GetParam();
+ int block_width_;
+ int block_height_;
+ InverseTransformMem inverse_transform_mem_;
+};
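
Editor's note: a quick worked example of the sign handling in Reset(). The random value is first masked to num_bits = bitdepth + 1 bits, and the shift pair is meant to reinterpret that field as a signed quantity, i.e. a value v with its top bit set stands for v - 2^num_bits. With the 10bpp parameters (num_bits = 11, mask = 0x7FF, sign_shift = 21), a masked value of 0x7F3 = 2035 therefore represents 2035 - 2048 = -13, keeping residual magnitudes within the bitdepth-plus-sign budget the comment describes.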
+
+//------------------------------------------------------------------------------
+// InverseTransformTest
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+class InverseTransformTest
+ : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> {
+ public:
+ InverseTransformTest() = default;
+ InverseTransformTest(const InverseTransformTest&) = delete;
+ InverseTransformTest& operator=(const InverseTransformTest&) = delete;
+ ~InverseTransformTest() override = default;
+
+ protected:
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::tx_size_;
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_width_;
+ using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_height_;
+ using InverseTransformTestBase<bitdepth, Pixel,
+ DstPixel>::inverse_transform_mem_;
+
+ void SetUp() override {
+ InverseTransformTestBase<bitdepth, Pixel, DstPixel>::SetUp();
+ InverseTransformInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+
+ tx_size_1d_row_ = kRow1DTransformSizes[tx_size_];
+ tx_size_1d_column_ = kCol1DTransformSizes[tx_size_];
+
+ memcpy(base_inverse_transforms_, dsp->inverse_transforms,
+ sizeof(base_inverse_transforms_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ InverseTransformInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ InverseTransformInit_NEON();
+ InverseTransformInit10bpp_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_inverse_transforms_, dsp->inverse_transforms,
+ sizeof(cur_inverse_transforms_));
+
+ for (int i = 0; i < kNum1DTransforms; ++i) {
+ // Skip functions that haven't been specialized for this particular
+ // architecture.
+ if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] ==
+ base_inverse_transforms_[i][tx_size_1d_row_][kRow]) {
+ cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr;
+ }
+ if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] ==
+ base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) {
+ cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr;
+ }
+ }
+
+ base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+ inverse_transform_mem_.base_frame);
+
+ cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize,
+ inverse_transform_mem_.cur_frame);
+ }
+
+ // These tests modify inverse_transform_mem_.
+ void TestRandomValues(int num_tests);
+ void TestDcOnlyRandomValue(int num_tests);
+
+ Array2DView<DstPixel> base_frame_buffer_;
+ Array2DView<DstPixel> cur_frame_buffer_;
+
+ TransformSize1D tx_size_1d_row_ = k1DTransformSize4;
+ TransformSize1D tx_size_1d_column_ = k1DTransformSize4;
+
+ InverseTransformAddFuncs base_inverse_transforms_;
+ InverseTransformAddFuncs cur_inverse_transforms_;
+};
+
+constexpr TransformType kLibgav1TxType[kNumTransformTypes] = {
+ kTransformTypeDctDct, kTransformTypeAdstDct,
+ kTransformTypeDctAdst, kTransformTypeAdstAdst,
+ kTransformTypeFlipadstDct, kTransformTypeDctFlipadst,
+ kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst,
+ kTransformTypeFlipadstAdst, kTransformTypeIdentityIdentity,
+ kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+ kTransformTypeIdentityAdst, kTransformTypeAdstIdentity,
+ kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity};
+
+// Maps TransformType to dsp::Transform1D for the row transforms.
+constexpr Transform1D kRowTransform[kNumTransformTypes] = {
+ k1DTransformDct, k1DTransformAdst, k1DTransformDct,
+ k1DTransformAdst, k1DTransformAdst, k1DTransformDct,
+ k1DTransformAdst, k1DTransformAdst, k1DTransformAdst,
+ k1DTransformIdentity, k1DTransformIdentity, k1DTransformDct,
+ k1DTransformIdentity, k1DTransformAdst, k1DTransformIdentity,
+ k1DTransformAdst};
+
+// Maps TransformType to dsp::Transform1D for the column transforms.
+constexpr Transform1D kColumnTransform[kNumTransformTypes] = {
+ k1DTransformDct, k1DTransformDct, k1DTransformAdst,
+ k1DTransformAdst, k1DTransformDct, k1DTransformAdst,
+ k1DTransformAdst, k1DTransformAdst, k1DTransformAdst,
+ k1DTransformIdentity, k1DTransformDct, k1DTransformIdentity,
+ k1DTransformAdst, k1DTransformIdentity, k1DTransformAdst,
+ k1DTransformIdentity};
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+ BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F),
+ BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) {
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ TransformSet tx_set;
+ if (tx_size_square_max > kTransformSize32x32) {
+ tx_set = kTransformSetDctOnly;
+ } else if (tx_size_square_max == kTransformSize32x32) {
+ tx_set = kTransformSetInter3;
+ } else if (tx_size_square_max == kTransformSize16x16) {
+ tx_set = kTransformSetInter2;
+ } else {
+ tx_set = kTransformSetInter1;
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type);
+}
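
Editor's note: IsTxSizeTypeValid() reduces the spec's transform-set rules (section 5.11.40) to one mask lookup -- the set is chosen from the square-max transform size and membership is a single bit test on the masks above. Assuming BitMaskSet::Contains is the obvious bit probe, the check works out like this for a 32x32 block (square-max 32x32, hence kTransformSetInter3 with mask 0x201):

  //   Contains(kTransformTypeDctDct)            -> (0x201 >> 0) & 1 == 1  (tested)
  //   Contains(kTransformTypeAdstAdst)          -> (0x201 >> 3) & 1 == 0  (skipped)
  //   Contains(kTransformTypeIdentityIdentity)  -> (0x201 >> 9) & 1 == 1  (tested)

so only DctDct and IdentityIdentity reach the timing loops for that size.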
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues(
+ int num_tests) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+ const TransformType tx_type = kLibgav1TxType[tx_type_idx];
+ const Transform1D row_transform = kRowTransform[tx_type];
+ const Transform1D column_transform = kColumnTransform[tx_type];
+
+ // Skip the 'C' test case as this is used as the reference.
+ if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ base_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr ||
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr) {
+ continue;
+ }
+
+ // Only test valid tx_size for given tx_type. See 5.11.40.
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+ absl::Duration base_elapsed_time[2];
+ absl::Duration cur_elapsed_time[2];
+
+ for (int n = 0; n < num_tests; ++n) {
+ const int tx_height = std::min(block_height_, 32);
+ const int start_x = 0;
+ const int start_y = 0;
+
+ inverse_transform_mem_.Reset(&rnd, block_width_, block_height_);
+ memcpy(inverse_transform_mem_.base_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+ memcpy(inverse_transform_mem_.cur_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+
+ const absl::Time base_row_start = absl::Now();
+ base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+ const absl::Time cur_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+ start_x, start_y, &cur_frame_buffer_);
+ cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+ const absl::Time base_column_start = absl::Now();
+ base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+ const absl::Time cur_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual,
+ start_x, start_y, &cur_frame_buffer_);
+ cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+ if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+ inverse_transform_mem_.cur_frame,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, false)) {
+ ADD_FAILURE() << "Result from optimized version of "
+ << ToString(
+ static_cast<TransformSize1D>(tx_size_1d_column_))
+ << " differs from reference in iteration #" << n
+ << " tx_type_idx: " << tx_type_idx;
+ break;
+ }
+ }
+
+ if (num_tests > 1) {
+ const auto base_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+ const auto cur_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+ printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n",
+ ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_],
+ base_row_elapsed_time_us, cur_row_elapsed_time_us,
+ static_cast<float>(base_row_elapsed_time_us) /
+ static_cast<float>(cur_row_elapsed_time_us));
+ const auto base_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+ const auto cur_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+ printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n",
+ ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_],
+ base_column_elapsed_time_us, cur_column_elapsed_time_us,
+ static_cast<float>(base_column_elapsed_time_us) /
+ static_cast<float>(cur_column_elapsed_time_us));
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel, typename DstPixel>
+void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestDcOnlyRandomValue(
+ int num_tests) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+ for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) {
+ const TransformType tx_type = kLibgav1TxType[tx_type_idx];
+ const Transform1D row_transform = kRowTransform[tx_type];
+ const Transform1D column_transform = kColumnTransform[tx_type];
+
+ if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] ==
+ nullptr ||
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_]
+ [kColumn] == nullptr) {
+ continue;
+ }
+
+ // Only test valid tx_size for given tx_type. See 5.11.40.
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue;
+
+ absl::Duration base_elapsed_time[2];
+ absl::Duration cur_elapsed_time[2];
+
+ for (int n = 0; n < num_tests; ++n) {
+ const int tx_height = std::min(block_height_, 32);
+ const int start_x = 0;
+ const int start_y = 0;
+
+ // Using width == 1 and height == 1 will reset only the dc value.
+ inverse_transform_mem_.Reset(&rnd, 1, 1);
+ memcpy(inverse_transform_mem_.base_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+ memcpy(inverse_transform_mem_.cur_residual,
+ inverse_transform_mem_.ref_src,
+ sizeof(inverse_transform_mem_.ref_src));
+
+ // For this test, "base" holds the output when tx_height is the maximum
+ // for the given block size, while "cur" holds the output when the
+ // adjusted_tx_height passed in is 1. The two outputs must match.
+ const absl::Time base_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kRow] += absl::Now() - base_row_start;
+
+ const absl::Time cur_row_start = absl::Now();
+ cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow](
+ tx_type, tx_size_, /*adjusted_tx_height=*/1,
+ inverse_transform_mem_.cur_residual, start_x, start_y,
+ &cur_frame_buffer_);
+ cur_elapsed_time[kRow] += absl::Now() - cur_row_start;
+
+ const absl::Time base_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual,
+ start_x, start_y, &base_frame_buffer_);
+ base_elapsed_time[kColumn] += absl::Now() - base_column_start;
+
+ const absl::Time cur_column_start = absl::Now();
+ cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn](
+ tx_type, tx_size_, /*adjusted_tx_height=*/1,
+ inverse_transform_mem_.cur_residual, start_x, start_y,
+ &cur_frame_buffer_);
+ cur_elapsed_time[kColumn] += absl::Now() - cur_column_start;
+
+ if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame,
+ inverse_transform_mem_.cur_frame,
+ block_width_, block_height_, kMaxBlockSize,
+ kMaxBlockSize, false)) {
+ ADD_FAILURE() << "Result from dc only version of "
+ << ToString(
+ static_cast<TransformSize1D>(tx_size_1d_column_))
+ << " differs from reference in iteration #" << n
+ << " tx_type_idx: " << tx_type_idx;
+ break;
+ }
+ }
+
+ if (num_tests > 1) {
+ const auto base_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow]));
+ const auto cur_row_elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow]));
+ printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n",
+ ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_],
+ base_row_elapsed_time_us, cur_row_elapsed_time_us,
+ static_cast<float>(base_row_elapsed_time_us) /
+ static_cast<float>(cur_row_elapsed_time_us));
+ const auto base_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(base_elapsed_time[kColumn]));
+ const auto cur_column_elapsed_time_us = static_cast<int>(
+ absl::ToInt64Microseconds(cur_elapsed_time[kColumn]));
+ printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n",
+ ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_],
+ base_column_elapsed_time_us, cur_column_elapsed_time_us,
+ static_cast<float>(base_column_elapsed_time_us) /
+ static_cast<float>(cur_column_elapsed_time_us));
+ }
+ }
+}
+
+using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>;
+
+TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+constexpr TransformSize kTransformSizesAll[] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+ kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp,
+ testing::ValuesIn(kTransformSizesAll));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize param) {
+ return os << ToString(param);
+}
+
+} // namespace libgav1
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
index 960d5a7..a28334d 100644
--- a/src/dsp/libgav1_dsp.cmake
+++ b/src/dsp/libgav1_dsp.cmake
@@ -40,8 +40,16 @@ list(APPEND libgav1_dsp_sources
"${libgav1_source}/dsp/film_grain_common.h"
"${libgav1_source}/dsp/intra_edge.cc"
"${libgav1_source}/dsp/intra_edge.h"
+ "${libgav1_source}/dsp/intrapred_cfl.cc"
+ "${libgav1_source}/dsp/intrapred_cfl.h"
+ "${libgav1_source}/dsp/intrapred_directional.cc"
+ "${libgav1_source}/dsp/intrapred_directional.h"
+ "${libgav1_source}/dsp/intrapred_filter.cc"
+ "${libgav1_source}/dsp/intrapred_filter.h"
"${libgav1_source}/dsp/intrapred.cc"
"${libgav1_source}/dsp/intrapred.h"
+ "${libgav1_source}/dsp/intrapred_smooth.cc"
+ "${libgav1_source}/dsp/intrapred_smooth.h"
"${libgav1_source}/dsp/inverse_transform.cc"
"${libgav1_source}/dsp/inverse_transform.h"
"${libgav1_source}/dsp/inverse_transform.inc"
@@ -67,6 +75,8 @@ list(APPEND libgav1_dsp_sources
list(APPEND libgav1_dsp_sources_avx2
${libgav1_dsp_sources_avx2}
+ "${libgav1_source}/dsp/x86/cdef_avx2.cc"
+ "${libgav1_source}/dsp/x86/cdef_avx2.h"
"${libgav1_source}/dsp/x86/convolve_avx2.cc"
"${libgav1_source}/dsp/x86/convolve_avx2.h"
"${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
@@ -89,11 +99,16 @@ list(APPEND libgav1_dsp_sources_neon
"${libgav1_source}/dsp/arm/intra_edge_neon.cc"
"${libgav1_source}/dsp/arm/intra_edge_neon.h"
"${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.h"
"${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
- "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.h"
"${libgav1_source}/dsp/arm/intrapred_neon.cc"
"${libgav1_source}/dsp/arm/intrapred_neon.h"
"${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h"
+ "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.h"
"${libgav1_source}/dsp/arm/loop_filter_neon.cc"
@@ -124,14 +139,23 @@ list(APPEND libgav1_dsp_sources_sse4
"${libgav1_source}/dsp/x86/cdef_sse4.h"
"${libgav1_source}/dsp/x86/convolve_sse4.cc"
"${libgav1_source}/dsp/x86/convolve_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.inc"
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.cc"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.h"
"${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
"${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h"
"${libgav1_source}/dsp/x86/intrapred_sse4.cc"
"${libgav1_source}/dsp/x86/intrapred_sse4.h"
- "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
"${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h"
"${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
"${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
"${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
new file mode 100644
index 0000000..ca5107a
--- /dev/null
+++ b/src/dsp/loop_filter_test.cc
@@ -0,0 +1,348 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Horizontal and Vertical need 32x32: 8 pixels preceding filtered section
+// 16 pixels within filtered section
+// 8 pixels following filtered section
+constexpr int kNumPixels = 1024;
+constexpr int kBlockStride = 32;
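+// The filters are therefore applied at an offset of (8, 8) into the 32x32
+// block, i.e. at dst + 8 + kBlockStride * 8.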
+
+constexpr int kNumTests = 50000;
+constexpr int kNumSpeedTests = 500000;
+
+constexpr int kMaxLoopFilter = 63;
+
+template <typename Pixel>
+void InitInput(Pixel* dst, const int stride, const int bitdepth,
+ libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh,
+ const bool transpose) {
+ const int max_pixel = (1 << bitdepth) - 1;
+ const int pixel_range = max_pixel + 1;
+ Pixel tmp[kNumPixels];
+ auto clip_pixel = [max_pixel](int val) {
+ return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0));
+ };
+
+ for (int i = 0; i < kNumPixels;) {
+ const uint8_t val = rnd.Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp[i++] = rnd(pixel_range);
+ } else { // 50% chance to repeat previous value in row X times.
+ int j = 0;
+ while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+ if (i < 1) {
+ tmp[i] = rnd(pixel_range);
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1));
+ } else { // Decrement by a value within the limit.
+ tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1));
+ }
+ ++i;
+ }
+ }
+ }
+
+ for (int i = 0; i < kNumPixels;) {
+ const uint8_t val = rnd.Rand8();
+    if (val & 0x80) {  // 50% chance to keep the current value.
+ ++i;
+ } else { // 50% chance to repeat previous value in column X times.
+ int j = 0;
+ while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+ if (i < 1) {
+ tmp[i] = rnd(pixel_range);
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+ tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1));
+        } else {  // Decrement by a value within the limit.
+ tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+ tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1));
+ }
+ ++i;
+ }
+ }
+ }
+
+ for (int i = 0; i < kNumPixels; ++i) {
+ const int offset = transpose ? stride * (i % stride) + i / stride : i;
+ dst[i] = tmp[offset];
+ }
+}
+
+template <int bitdepth, typename Pixel>
+class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
+ public:
+ LoopFilterTest() = default;
+ LoopFilterTest(const LoopFilterTest&) = delete;
+ LoopFilterTest& operator=(const LoopFilterTest&) = delete;
+ ~LoopFilterTest() override = default;
+
+ protected:
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopFilterInit_C();
+
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memcpy(base_loop_filters_, dsp->loop_filters[size_],
+ sizeof(base_loop_filters_));
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ memset(base_loop_filters_, 0, sizeof(base_loop_filters_));
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ LoopFilterInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopFilterInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+
+ memcpy(cur_loop_filters_, dsp->loop_filters[size_],
+ sizeof(cur_loop_filters_));
+
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+      // Skip functions that haven't been specialized for this particular
+      // architecture.
+ if (cur_loop_filters_[i] == base_loop_filters_[i]) {
+ cur_loop_filters_[i] = nullptr;
+ }
+ }
+ }
+
+  // Check |digests| if non-NULL, otherwise print the filter timing.
+ void TestRandomValues(const char* const digests[kNumLoopFilterTypes],
+ int num_runs) const;
+ void TestSaturatedValues() const;
+
+ const LoopFilterSize size_ = GetParam();
+ LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes];
+ LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes];
+};
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
+ const char* const digests[kNumLoopFilterTypes], const int num_runs) const {
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ if (cur_loop_filters_[i] == nullptr) continue;
+
+ libvpx_test::MD5 md5_digest;
+ absl::Duration elapsed_time;
+ for (int n = 0; n < num_runs; ++n) {
+ Pixel dst[kNumPixels];
+ const auto outer_thresh =
+ static_cast<uint8_t>(rnd(3 * kMaxLoopFilter + 5));
+ const auto inner_thresh = static_cast<uint8_t>(rnd(kMaxLoopFilter + 1));
+ const auto hev_thresh =
+ static_cast<uint8_t>(rnd(kMaxLoopFilter + 1) >> 4);
+ InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0);
+
+ const absl::Time start = absl::Now();
+ cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride,
+ outer_thresh, inner_thresh, hev_thresh);
+ elapsed_time += absl::Now() - start;
+
+ md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst));
+ }
+ if (digests == nullptr) {
+ const auto elapsed_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+ printf("Mode %s[%25s]: %5d us\n",
+ ToString(static_cast<LoopFilterSize>(size_)),
+ ToString(static_cast<LoopFilterType>(i)), elapsed_time_us);
+ } else {
+ const std::string digest = md5_digest.Get();
+ printf("Mode %s[%25s]: MD5: %s\n",
+ ToString(static_cast<LoopFilterSize>(size_)),
+ ToString(static_cast<LoopFilterType>(i)), digest.c_str());
+ EXPECT_STREQ(digests[i], digest.c_str());
+ }
+ }
+}
+
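+// Fills the block with the maximum pixel value for the bitdepth. With no
+// differences across the edge the filter output should match the input
+// exactly; a mismatch would also flag overflow in intermediate arithmetic.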
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const {
+ const LoopFilterType filter = kLoopFilterTypeHorizontal;
+ if (cur_loop_filters_[filter] == nullptr) return;
+
+ Pixel dst[kNumPixels], ref[kNumPixels];
+ const auto value = static_cast<Pixel>((1 << bitdepth) - 1);
+ for (auto& r : dst) r = value;
+ memcpy(ref, dst, sizeof(dst));
+
+ const int outer_thresh = 24;
+ const int inner_thresh = 8;
+ const int hev_thresh = 0;
+ cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride,
+ outer_thresh, inner_thresh, hev_thresh);
+ ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
+ kBlockStride, kBlockStride, true))
+ << "kLoopFilterTypeHorizontal output doesn't match reference";
+}
+
+//------------------------------------------------------------------------------
+
+using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>;
+
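+// Returns the MD5 digests of the filtered output for the given size, one
+// digest per LoopFilterType.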
+const char* const* GetDigests8bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "2e07bdb04b363d4ce69c7d738b1ee01a",
+ "7ff41f2ffa809a2016d342d92afa7f89",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "2cd4d9ee7497ed67e38fad9cbeb7e278",
+ "75c57a30a927d1aca1ac5c4f175712ca",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "854860a272d58ace223454ea727a6fe4",
+ "4129ee49b047777583c0e9b2006c87bf",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "6eb768620b7ccc84b6f88b9193b02ad2",
+ "56e034d9edbe0d5a3cae69b2d9b3486e",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest8bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest8bpp, FixedInput) {
+ TestRandomValues(GetDigests8bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); }
+
+constexpr LoopFilterSize kLoopFilterSizes[] = {
+ kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14};
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>;
+
+const char* const* GetDigests10bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "657dd0f612734c9c1fb50a2313567af4",
+ "b1c0a0a0b35bad1589badf3c291c0461",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "d41906d4830157052d5bde417d9df9fc",
+ "451490def78bd649d16d64db4e665a62",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "a763127680f31db7184f2a63ee140268",
+ "1f413bebacaa2435f0e07963a9095243",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "f0e61add3e5856657c4055751a6dd6e2",
+ "44da25d613ea601bf5f6e2a42d329cf0",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest10bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest10bpp, FixedInput) {
+ TestRandomValues(GetDigests10bpp(size_), kNumTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif
+#endif
+
+} // namespace
+
+static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) {
+ return os << ToString(size);
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
index 0909df0..1a15d90 100644
--- a/src/dsp/loop_restoration.cc
+++ b/src/dsp/loop_restoration.cc
@@ -143,12 +143,12 @@ inline void WienerVertical(const int16_t* wiener_buffer, const int width,
// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
// Thus in libaom's computation, an offset of 128 is needed for filter[3].
template <int bitdepth, typename Pixel>
-void WienerFilter_C(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_C(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
constexpr int kCenterTap = kWienerFilterTaps / 2;
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
@@ -170,38 +170,42 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info,
auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
- width, height_extra, filter_horizontal, 0,
- &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 0, &wiener_buffer);
WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
filter_horizontal, 0, &wiener_buffer);
- WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
- filter_horizontal, 0, &wiener_buffer);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
- width, height_extra, filter_horizontal, 1,
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 0,
&wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 1, &wiener_buffer);
WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
filter_horizontal, 1, &wiener_buffer);
- WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
- filter_horizontal, 1, &wiener_buffer);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
- WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
- width, height_extra, filter_horizontal, 2,
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 1,
&wiener_buffer);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 2, &wiener_buffer);
WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
filter_horizontal, 2, &wiener_buffer);
- WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
- filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 2,
+ &wiener_buffer);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride,
- width, height_extra, filter_horizontal, 3,
- &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 3, &wiener_buffer);
WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
filter_horizontal, 3, &wiener_buffer);
- WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra,
- filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 3,
+ &wiener_buffer);
}
// vertical filtering.
@@ -233,7 +237,7 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info,
//------------------------------------------------------------------------------
// SGR
-// When |height| is 1, |src_stride| could be set to arbitrary value.
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
template <typename Pixel, int size>
LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
const int height, const int width,
@@ -267,7 +271,7 @@ LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
} while (--y != 0);
}
-// When |height| is 1, |src_stride| could be set to arbitrary value.
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
template <typename Pixel>
LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
const int height, const int width,
@@ -541,8 +545,11 @@ inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
template <int bitdepth, typename Pixel>
inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
- const Pixel* src, const Pixel* const top_border,
- const Pixel* bottom_border, const ptrdiff_t stride,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
const int width, const int height,
SgrBuffer* const sgr_buffer, Pixel* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 8);
@@ -582,8 +589,8 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum<Pixel>(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3,
- square_sum5 + 1);
+ BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
+ square_sum3, square_sum5 + 1);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
@@ -631,7 +638,7 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
ptrdiff_t s_stride;
if ((height & 1) == 0) {
sr = bottom_border;
- s_stride = stride;
+ s_stride = bottom_border_stride;
} else {
sr = src + 2 * stride;
s_stride = bottom_border - (src + 2 * stride);
@@ -658,8 +665,9 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxSum<Pixel>(bottom_border + stride, stride, 1, width + 2, sum3 + 2,
- sum5 + 3, square_sum3 + 2, square_sum5 + 3);
+ BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
+ width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
sum5[4] = sum5[3];
square_sum5[4] = square_sum5[3];
BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
@@ -681,12 +689,13 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
template <int bitdepth, typename Pixel>
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const Pixel* src,
+ const Pixel* src, const ptrdiff_t stride,
const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
const Pixel* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- Pixel* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 8);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
@@ -705,7 +714,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<Pixel, 5>(top_border, stride, 2, width + 2, sum5 + 1, square_sum5 + 1);
+ BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
+ square_sum5 + 1);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
@@ -736,7 +746,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
ptrdiff_t s_stride;
if ((height & 1) == 0) {
sr = bottom_border;
- s_stride = stride;
+ s_stride = bottom_border_stride;
} else {
sr = src + 2 * stride;
s_stride = bottom_border - (src + 2 * stride);
@@ -755,8 +765,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxSum<Pixel, 5>(bottom_border + stride, stride, 1, width + 2, sum5 + 3,
- square_sum5 + 3);
+ BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
+ 1, width + 2, sum5 + 3, square_sum5 + 3);
sum5[4] = sum5[3];
square_sum5[4] = square_sum5[3];
BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
@@ -772,12 +782,13 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
template <int bitdepth, typename Pixel>
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const Pixel* src,
+ const Pixel* src, const ptrdiff_t stride,
const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
const Pixel* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- Pixel* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 8);
const ptrdiff_t sum_stride = temp_stride + 8;
@@ -802,7 +813,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<Pixel, 3>(top_border, stride, 2, width + 2, sum3, square_sum3);
+ BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
+ square_sum3);
BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
sgr_buffer, ma343[0], b343[0], nullptr,
@@ -814,7 +826,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
@@ -845,7 +857,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -854,12 +866,12 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
}
template <int bitdepth, typename Pixel>
-void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void SelfGuidedFilter_C(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
@@ -872,17 +884,17 @@ void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info,
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
- bottom - 3, stride, width, height,
- sgr_buffer, dst);
+ BoxFilterProcessPass1<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2,
- bottom - 2, stride, width, height,
- sgr_buffer, dst);
+ BoxFilterProcessPass2<bitdepth, Pixel>(
+ restoration_info, src - 2, stride, top - 2, top_border_stride,
+ bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
- bottom - 3, stride, width, height,
- sgr_buffer, dst);
+ BoxFilterProcess<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
}
}
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc
new file mode 100644
index 0000000..97a05d4
--- /dev/null
+++ b/src/dsp/loop_restoration_test.cc
@@ -0,0 +1,616 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// In units of Pixel.
+constexpr int kBorder = 16;
+constexpr int kWidth = 256;
+constexpr int kHeight = 255;
+constexpr int kStride = kWidth + 2 * kBorder;
+constexpr int kOffset = kBorder * kStride + kBorder;
+constexpr int kMaxBlockSize = 288 * kStride;
+constexpr int kUnitWidths[] = {32, 64, 128, 256};
+
+constexpr int kNumRadiusTypes = 3;
+constexpr int kNumWienerOrders = 4;
+constexpr int kWienerOrders[] = {7, 5, 3, 1};
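+// Maps a Wiener filter order from kWienerOrders to the index used to select
+// digests: order 7 -> 0, 5 -> 1, 3 -> 2, 1 -> 3.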
+constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0};
+
+template <int bitdepth, typename Pixel>
+class SelfGuidedFilterTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ SelfGuidedFilterTest() = default;
+ SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete;
+ SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete;
+ ~SelfGuidedFilterTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopRestorationInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) != 0) {
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif
+ }
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopRestorationInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ target_self_guided_filter_func_ = dsp->loop_restorations[1];
+ restoration_info_.type = kLoopRestorationTypeSgrProj;
+ memset(dst_, 0, sizeof(dst_));
+ }
+
+ void SetInputData(int type, Pixel value, int radius_index,
+ libvpx_test::ACMRandom* rnd);
+ void TestFixedValues(int test_index, Pixel value);
+ void TestRandomValues(bool speed);
+
+ protected:
+ const int unit_width_ = GetParam();
+ const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+ alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+ RestorationUnitInfo restoration_info_;
+ RestorationBuffer restoration_buffer_;
+ LoopRestorationFunc target_self_guided_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData(
+ int type, Pixel value, int radius_index,
+ libvpx_test::ACMRandom* const rnd) {
+ const int mask = (1 << bitdepth) - 1;
+ if (type == 0) { // Set fixed values
+ for (auto& s : src_) s = value;
+ } else { // Set random values
+ for (auto& s : src_) s = rnd->Rand16() & mask;
+ }
+ for (auto& d : dst_) d = rnd->Rand16() & mask;
+ restoration_info_.sgr_proj_info.multiplier[0] =
+ kSgrProjMultiplierMin[0] +
+ rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] +
+ 1);
+ restoration_info_.sgr_proj_info.multiplier[1] =
+ kSgrProjMultiplierMin[1] +
+ rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] +
+ 1);
+  // Regulate the multiplier so that it matches libaom.
+  // A valid self-guided filter doesn't allow r0 and r1 to both be 0.
+ // When r0 or r1 is zero, its corresponding multiplier is set to zero in
+ // libaom.
+ int index;
+ if (radius_index == 0) {
+ index = 0; // r0 = 2, r1 = 1
+ } else if (radius_index == 1) {
+ index = 10; // r0 = 0, r1 = 1
+ } else /* if (radius_index == 2) */ {
+ index = 14; // r0 = 2, r1 = 0
+ }
+ const uint8_t r0 = kSgrProjParams[index][0];
+ const uint8_t r1 = kSgrProjParams[index][2];
+ static constexpr int kMultiplier[2] = {0, 95};
+ restoration_info_.sgr_proj_info.index = index;
+ if (r0 == 0) {
+ restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0];
+ } else if (r1 == 0) {
+ restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1];
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index,
+ Pixel value) {
+ static const char* const kDigest[][2][kNumRadiusTypes] = {
+ {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0",
+ "a03314fc210bee68c7adbb44d2bbdac7"},
+ {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54",
+ "a6583fe9359877f4a259c81d900fc4fb"}},
+ {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a",
+ "27100f37b3e42a5f2a051e1566edb6f8"},
+ {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3",
+ "69c274ac59c99999e1bfbf2fc4586ebd"}},
+ {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4",
+ "92f31086ba2f9e1508983b22d93a4e5c"},
+ {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13",
+ "43dd7df2c2a601262c68cd8af1c61b82"}},
+ {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4",
+ "f8a6a025827f29f857bed3e28ba3ea33"},
+ {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42",
+ "20dcbe8e317a4373bebf11d56adc5f02"}}};
+ if (target_self_guided_filter_func_ == nullptr) return;
+ ASSERT_LT(value, 1 << bitdepth);
+ constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+ SetInputData(0, value, radius_index, &rnd);
+ const absl::Time start = absl::Now();
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_self_guided_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+ kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride,
+ kHeight * kStride * sizeof(*dst_), elapsed_time);
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+ static const char* const kDigest[][2][kNumRadiusTypes] = {
+ {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb",
+ "ca67159cd29475ac5d52ca4a0df3ea10"},
+ {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678",
+ "a8ba988283d9e1ad1f0dcdbf6bbdaade"}},
+ {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9",
+ "a4005899fa8d3c3c4669910f93ff1290"},
+ {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e",
+ "07203ad761775d5d317f2b7884afd9fe"}},
+ {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084",
+ "475bcb6a58f87da7723f6227bc2aca0e"},
+ {"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f",
+ "7cb5c5dbdb3d1c54cfa00def450842dc"}},
+ {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9",
+ "f1eda6d15b37172199d9949c2315832f"},
+ {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8",
+ "b23dc0b54c3500248d53377030428a61"}},
+ {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36",
+ "23966cba3e0e7803eeb951905861e0dd"},
+ {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de",
+ "dcee48f94126a2132963e86e93dd4903"}}};
+ if (target_self_guided_filter_func_ == nullptr) return;
+ constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ const int num_inputs = speed ? 1 : 5;
+ const int num_tests = speed ? 20000 : 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (int i = 0; i < num_inputs; ++i) {
+ for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) {
+ SetInputData(1, 0, radius_index, &rnd);
+ const absl::Time start = absl::Now();
+ for (int k = 0; k < num_tests; ++k) {
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_self_guided_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(),
+ kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride,
+ kHeight * kStride * sizeof(*dst_), elapsed_time);
+ }
+ }
+}
+
+using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>;
+
+TEST_P(SelfGuidedFilterTest8bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 128);
+ TestFixedValues(3, 255);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest10bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 512);
+ TestFixedValues(3, 1023);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename Pixel>
+class WienerFilterTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ WienerFilterTest() = default;
+ WienerFilterTest(const WienerFilterTest&) = delete;
+ WienerFilterTest& operator=(const WienerFilterTest&) = delete;
+ ~WienerFilterTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ LoopRestorationInit_C();
+ const Dsp* const dsp = GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ base_wiener_filter_func_ = dsp->loop_restorations[0];
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "AVX2/")) {
+ if ((GetCpuInfo() & kAVX2) != 0) {
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif
+ }
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ LoopRestorationInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ LoopRestorationInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ target_wiener_filter_func_ = dsp->loop_restorations[0];
+ restoration_info_.type = kLoopRestorationTypeWiener;
+ memset(dst_, 0, sizeof(dst_));
+ memset(tmp_, 0, sizeof(tmp_));
+ memset(buffer_, 0, sizeof(buffer_));
+ }
+
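+  // Zeroes the leading taps that a shorter (lower-order) Wiener filter does
+  // not use; e.g. order 3 keeps only filter[2] and the center tap filter[3].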
+ static void CleanFilterByOrder(const int order,
+ int16_t filter[kWienerFilterTaps]) {
+ if (order <= 5) filter[0] = 0;
+ if (order <= 3) filter[1] = 0;
+ if (order <= 1) filter[2] = 0;
+ }
+
+ void SetInputData(int type, Pixel value, int vertical_order,
+ int horizontal_order);
+ void TestFixedValues(int digest_id, Pixel value);
+ void TestRandomValues(bool speed);
+ void TestCompare2C();
+
+ protected:
+ const int unit_width_ = GetParam();
+ const int unit_height_ = kRestorationUnitHeight;
+
+ private:
+ alignas(kMaxAlignment)
+ uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) *
+ kRestorationUnitHeight];
+ alignas(kMaxAlignment) Pixel src_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize];
+ alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize];
+ RestorationUnitInfo restoration_info_;
+ RestorationBuffer restoration_buffer_;
+ LoopRestorationFunc base_wiener_filter_func_;
+ LoopRestorationFunc target_wiener_filter_func_;
+};
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::SetInputData(
+ int type, Pixel value, const int vertical_order,
+ const int horizontal_order) {
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ if (type == 0) {
+ for (auto& s : src_) s = value;
+ } else {
+ for (auto& s : src_) s = rnd.Rand16() & mask;
+ }
+ int order = vertical_order;
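+  // The first pass sets the vertical filter taps, the second pass the
+  // horizontal ones.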
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ auto& filter = restoration_info_.wiener_info.filter[i];
+ filter[3] = 128;
+ for (int j = 0; j < 3; ++j) {
+ filter[j] = kWienerTapsMin[j] +
+ rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1);
+ }
+ CleanFilterByOrder(order, filter);
+ filter[3] -= 2 * (filter[0] + filter[1] + filter[2]);
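+    // That is, filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]),
+    // matching the explicit computation in TestCompare2C().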
+ restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+ (kWienerFilterTaps - order) / 2;
+ order = horizontal_order;
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id,
+ Pixel value) {
+ static const char* const kDigest[2][4] = {
+ {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d",
+ "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"},
+ {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+ "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}};
+ if (target_wiener_filter_func_ == nullptr) return;
+ ASSERT_LT(value, 1 << bitdepth);
+ constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(0, value, vertical_order, horizontal_order);
+ memset(dst_, 0, sizeof(dst_));
+ const absl::Time start = absl::Now();
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_wiener_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+ kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time);
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
+ static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = {
+ {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b",
+ "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"},
+ {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9",
+ "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"},
+ {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59",
+ "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"},
+ {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c",
+ "52a673b6d14fbdca5ebdb1a34ee3326f",
+ "ebb42c7c9111f2e39f21e2158e801d9e"}},
+ {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196",
+ "375b43abc1d6682d62f91c1841b8b0fc", "71e2444822ae9c697ddfc96e07c6e8a1"},
+ {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c",
+ "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"},
+ {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068",
+ "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"},
+ {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf",
+ "7cbc1562a9dd08e1973b3b9ac1afc765",
+ "3c91bf1a34672cd40bf261c5820d3ec3"}}};
+ if (target_wiener_filter_func_ == nullptr) return;
+ constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ const int num_tests = speed ? 100000 : 1;
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+ memset(dst_, 0, sizeof(dst_));
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_tests; ++i) {
+ for (int y = 0; y < kHeight; y += unit_height_) {
+ const int height = std::min(unit_height_, kHeight - y);
+ for (int x = 0; x < kWidth; x += unit_width_) {
+ const int width = std::min(unit_width_, kWidth - x);
+ const Pixel* const source = src + y * kStride + x;
+ target_wiener_filter_func_(
+ restoration_info_, source, kStride,
+ source - kRestorationVerticalBorder * kStride, kStride,
+ source + height * kStride, kStride, width, height,
+ &restoration_buffer_, dst + y * kStride + x);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(),
+ kDigest[bd_index][kWienerOrderIdLookup[vertical_order]]
+ [kWienerOrderIdLookup[horizontal_order]],
+ dst_, sizeof(dst_), elapsed_time);
+ }
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() {
+ if (base_wiener_filter_func_ == nullptr) return;
+ if (target_wiener_filter_func_ == nullptr) return;
+ if (base_wiener_filter_func_ == target_wiener_filter_func_) return;
+ const Pixel* const src = src_ + kOffset;
+ Pixel* const dst = dst_ + kOffset;
+ Pixel* const tmp = tmp_ + kOffset;
+ for (const auto vertical_order : kWienerOrders) {
+ for (const auto horizontal_order : kWienerOrders) {
+ SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order);
+ for (int x = 0; x < 2; ++x) {
+ // Prepare min/max filter coefficients.
+ int order = vertical_order;
+ for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+ auto& filter = restoration_info_.wiener_info.filter[i];
+ for (int j = 0; j < 3; ++j) {
+ filter[j] = (x == 0) ? kWienerTapsMin[j] : kWienerTapsMax[j];
+ }
+ CleanFilterByOrder(order, filter);
+ filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]);
+ restoration_info_.wiener_info.number_leading_zero_coefficients[i] =
+ (kWienerFilterTaps - order) / 2;
+ order = horizontal_order;
+ }
+ base_wiener_filter_func_(restoration_info_, src, kStride,
+ src - kRestorationVerticalBorder * kStride,
+ kStride, src + unit_height_ * kStride, kStride,
+ unit_width_, unit_height_,
+ &restoration_buffer_, dst);
+ target_wiener_filter_func_(restoration_info_, src, kStride,
+ src - kRestorationVerticalBorder * kStride,
+ kStride, src + unit_height_ * kStride,
+ kStride, unit_width_, unit_height_,
+ &restoration_buffer_, tmp);
+ if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_,
+ kStride, kStride, false, false)) {
+ ADD_FAILURE() << "Mismatch -- wiener taps min/max";
+ }
+ }
+ }
+ }
+}
+
+using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>;
+
+TEST_P(WienerFilterTest8bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 128);
+ TestFixedValues(3, 255);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>;
+
+TEST_P(WienerFilterTest10bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 512);
+ TestFixedValues(3, 1023);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
index 101c410..15ef821 100644
--- a/src/dsp/mask_blend.cc
+++ b/src/dsp/mask_blend.cc
@@ -25,8 +25,8 @@ namespace libgav1 {
namespace dsp {
namespace {
-template <int subsampling_x, int subsampling_y>
-uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x,
+ int subsampling_x, int subsampling_y) {
if ((subsampling_x | subsampling_y) == 0) {
return mask[x];
}
@@ -63,7 +63,7 @@ void MaskBlend_C(const void* prediction_0, const void* prediction_1,
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
const uint8_t mask_value =
- GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
if (is_inter_intra) {
dst[x] = static_cast<Pixel>(RightShiftWithRounding(
mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
@@ -96,7 +96,7 @@ void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0,
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
const uint8_t mask_value =
- GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
6));
@@ -148,6 +148,7 @@ void Init8bpp() {
#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
#endif
+ static_cast<void>(GetMaskValue);
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc
new file mode 100644
index 0000000..b5e7e60
--- /dev/null
+++ b/src/dsp/mask_blend_test.cc
@@ -0,0 +1,493 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+// mask_blend is applied to compound prediction values when is_inter_intra is
+// false, which implies a value range far exceeding that of pixel values. The
+// ranges include kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58",
+ "c490478208374a43765900ef7115c264", "b98f222eb70ef8589da2d6c839ca22b8",
+ "54752ca05f67b5af571bc311aa4e3de3", "344b2dab7accd8bd0a255bee16207336",
+ "0b2f6f755d1547eea7e0172f8133ea01", "310dc6364fdacba186c01f0e8ac4fcb7",
+ "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827",
+ "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7",
+ "4346c2860b23f0072b6b288f14c1df36", "8f8dd3eeed74ef115ca8a2f82ebff0ba",
+ "42e8872a81647767636f4c75609e0e2f", "1ff2526547d59557f7bb458249e34527",
+ "cd303d685268aebd2919dd468928d0ba", "254fb3ad990f9d408d252c70dd682e27",
+ "ba8d99c62853d14855f5d93e9574c97b", "e8ab744348681d6aa1043080efa86fc9",
+ "2fa919ca1f54b4336de878ff4015c352", "18e47c9809b909c2bfad08e00dffc635",
+ "9a90c843f06f0b662c509c26f5dd5054", "f89c608f884f37b064fc2b49eb2690a9",
+ "2448734d948ca6ddeb0ce8038a4ab2cf", "a3e0f86b7a5cb49716a424709c00b5a4",
+ "eb84dba768b54da10cded2f932f0aab7", "d6e8fdeb6875b70488f25d7f7ed9423f",
+ "1ca0822febce19c02ddc42a7b3331257", "a9259bb9b87ad002619eb47b907d7226",
+ "6408c5f327f1a9a390fb0046d4bc112b", "dba612489f87d00a82f2735fbcb98dcc",
+ "e8626a97699fbd247d6358ad5f766bee", "5e638a6897d7a2950f3512f871fa19e6",
+ "45a58708939779413f8e0e1de2ee5e6f", "079ae4682d398f0a7e4b66059589586d",
+ "6a06e617308409f9181b59bdd4f63d83", "b05ade2c1a572fc5fcca92b4163d9afb",
+ "30e955c3f86111207d5922575602e90a", "af5e6c65ed48a0eb7d509f7036398728",
+ "f9da3310d7dc75910483dfdd2af6ee62", "a9423b4d67bee5e7c7bc3baa7a9c017a",
+ "6b90a04333407013dd011c1af582e79f", "e658088a74bfb7cc57a2faa74a6f8689",
+ "6eedf27126eba6915035f9f701a1b992", "89116a7c6ad3f70a5b3f3105d04ad1a8",
+ "f41e5e166b049d0006d8b2cab56523b3", "3bed57a684075bbe3c25fd0c3e5520c3",
+ "85c0b21af2afb18ce948abfe3e23c85b", "bd8aaa3602d6b42438f8449f8adb52cb",
+ "1266bad904caad2c6d4047abefc2393d", "6573f2fe2a14c9ab7d5e192742388489",
+ "6b9b443f6306059fa3fe18df9de6dc48", "c9a91ee6ae8b653f552866e4073dd097",
+ "fa58938384198f7709d4871d155ba100", "033d121fc782e83ff94c31e73407d2a8",
+ "7ea268d79f7b8c75a4feeb24e892471a", "73a376bb3e07172d1e094ab8e01a7d42",
+ "13c366e0da1663fac126ea3d3876c110", "2f5eb5fcdf953c63fee2b8c75a6e5568",
+ "2054b197f002223f2d75699884279511", "67ce53e6991657a922d77cc8a23f1e07",
+ "f48e6d666435e7a917d6f90539b0d557", "21d03669d8d255e43552f8fb90724717",
+ "43dbaa1a7aaf2a01764e78e041b6763b", "a8173347ea861ecee6da54f81df73951",
+ "6b97ec4e4647a8de026d693059b855b7", "a85bf4c4b48791ac4971339877e4bc8a",
+ "04cf84d020a60ce3ce53845255ca8ec9", "ddd87035b960499b883d0aefcf96b6b2",
+ "278c5dd102474d598bf788cd66977ba9", "78b3790785811516142d417a49177c8c",
+ "7883ea9c2df0b4f5797cba31f4352678", "727004811025ac97b04940e2eaf68f94",
+ "7ffa3f97ec13dc8b6225550133a392bc", "6f5f2cb7a44aa0daea5c6b3315110591",
+ "88a59d68875fb44ec3be9d3fa293bccb", "0516e71f76b9d998794d3d63e480fa2f",
+ "193793d42f0964b4a958a68d9d7eb4ba", "4d259c7c6a95744e4ebaaa5361befb11",
+ "c090155b997dc103203bcb5a9dcc6282",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121",
+ "53a3a76bf2bcd5761cd15fc739a4f4e1", "7597f69dc19a584280be0d67911db6a6",
+ "e1221c172843dc6c1b345bcd370771cc", "2ccbe012ca167114b14c3ba70befa960",
+ "0f68632d7e5faddb4554ca430d1df822", "8caa0061a26e142b783951d5abd7bf5d",
+ "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882",
+ "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952",
+ "346a797b7cb4de10759e329f8b49e077", "8f4aa102e9b1ac430bdb9ebd4ec4cfca",
+ "5886397456b15e504ad55d8e0ce71e0e", "2a78b52ce43dc28606e83521963c00fa",
+ "8d3ef5280063337b0df97f91251bb8fc", "81f0ceada000ce40586be828a2045430",
+ "edb7b70a473392148bc419a44385326b", "97abe2eecaf9158a0529b234a241a57a",
+ "65729d750aa1258e4a7eccef247ac8c2", "78cc995e81188b9e8b29fa58796a3313",
+ "a1eb6a8c2f7c77e30e739a1b3b07cc74", "805b0f2f4b9d80f118d800b5ab4f603e",
+ "12610c83533f7170149390ba581f70b2", "cba20deed43b49ada3f626c91510995d",
+ "ba7ea35410b746fcbcf56c24ccb56d59", "933b2235b9b943984607d87f0bce1067",
+ "7ae59015295db8983bc8472429076464", "c18cce63327b367c0a260e9cbf4222b9",
+ "7c9672a7dfa964cb3ed3f2b4b443d2b6", "b29bcf1cc5369702e0179db1198db531",
+ "412326aff6c89116240b5d3ef63fa5cc", "3d854589fd171e42d118be4627ec5330",
+ "9a157e51e39ed314031224f074193791", "c645cdc63d3112f27b90cc9080c6d071",
+ "3f360cc336a4ee9a9bd78bde1a6e9eb3", "37b40fa8674d03a7cd66afdee939b9bf",
+ "cd6c7b98fe71b533c6a06d6d9122a6d0", "c26e0a0e90a969d762edcab770bed3b7",
+ "e517967d2cf4f1b0fff09d334475e2ae", "bc760a328a0a4b2d75593667adfa2a0e",
+ "b6239fdeeccc462640047cb2e2c2be96", "bc01f6a232ef9f0d9e57301779edd67f",
+ "cf6e8c1823c5498fa5589db40406a6ad", "2a9a4bd0bd84f0b85225a5b30f5eaa16",
+ "56f7bb2265dbd8a563bb269aa527c8a3", "fcbed0f0350be5a1384f95f8090d262e",
+ "f3ecf2e5747ebff65ac78ecbe7cc5e6a", "1d57d1371ad2f5f320cc4de789665f7c",
+ "e9f400fee64673b0f6313400fe449135", "5dfdc4a8376740011c777df46418b5d2",
+ "a4eb2c077300c0d8eeda028c9db3a63a", "90551259280c2b2150f018304204f072",
+ "4cbcd76496fc5b841cd164b6067b9c0b", "895964acc7b7e7d084de2266421c351b",
+ "af2e05159d369d0e3b72707f242b2845", "c7d393cef751950df3b9ed8056a9ffce",
+ "788541c0807aed47b863d47e5912555d", "163a06512f48c1b0f2535c8c50815bcc",
+ "dc5e723bab9fbfd7074a62e05b6b3c2b", "bf91200ce1bf97b4642a601adc13d700",
+ "d93fcefa6b9004baaab76d436e7ac931", "e89a2111caecc6bcf5f2b42ea0167ab4",
+ "e04a058df9b87878ca97edc1c42e76e1", "5d1f60876147edd6ed29d1fb50172464",
+ "655fb228aa410fd244c58c87fe510bec", "639a8a0a8f62d628136f5a97b3728b69",
+ "5b60f2428b092a502d6471fa09befd7f", "40601555ac945b4d37d3434b6e5619be",
+ "02be23bf1f89d5f5af02a39b98f96142", "9347a45bd54d28d8105f8183996b3505",
+ "d8429cc7b0b388981861a0fdd40289f0", "c4b7fab3b044486f663e160c07805e0a",
+ "f5f5d513b1f1c13d0abc70fc18afea48", "f236795ea30f1b8761b268734a245ba1",
+ "c7b7452ea8247a3a40248278d08953d5", "ddd6ba3c5ec56cc7a0b0161ae67001fa",
+ "94675749f2db46a8ade6f2f211db9a32", "3d165364ff96a5ef39e67a53fe3ed3be",
+ "3d1d66a9401fd7e78050724ca1fa0419",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct MaskBlendTestParam {
+ MaskBlendTestParam(int width, int height, int subsampling_x,
+ int subsampling_y, bool is_inter_intra,
+ bool is_wedge_inter_intra)
+ : width(width),
+ height(height),
+ subsampling_x(subsampling_x),
+ subsampling_y(subsampling_y),
+ is_inter_intra(is_inter_intra),
+ is_wedge_inter_intra(is_wedge_inter_intra) {}
+ int width;
+ int height;
+ int subsampling_x;
+ int subsampling_y;
+ bool is_inter_intra;
+ bool is_wedge_inter_intra;
+};
+
+std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height
+ << ", subsampling(x/y): " << param.subsampling_x << "/"
+ << param.subsampling_y
+ << ", is_inter_intra: " << param.is_inter_intra
+ << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra;
+}
+
+template <int bitdepth, typename Pixel>
+class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ MaskBlendTest() = default;
+ ~MaskBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ MaskBlendInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MaskBlendInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ MaskBlendInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra)
+ ? dsp->mask_blend[0][param_.is_inter_intra]
+ : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y]
+ [param_.is_inter_intra];
+ func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra
+ ? param_.subsampling_x +
+ param_.subsampling_y
+ : 0];
+ }
+
+ protected:
+ int GetDigestIdOffset() const {
+ // The id selects the corresponding digest from the lookup table for a given
+ // set of input parameters. It is derived from the width, the height and an
+ // offset (id_offset).
+ // For example, in kMaskBlendTestParam, the parameter set
+ // (8, 8, 0, 0, false, false) corresponds to the first entry in the
+ // digest lookup table, where id == 0.
+ // (8, 8, 1, 0, false, false) corresponds to id == 13.
+ // (8, 8, 1, 1, false, false) corresponds to id == 26.
+ // (8, 8, 0, 0, true, false) corresponds to id == 39.
+ // id_offset denotes the offset for the different modes (is_inter_intra,
+ // is_wedge_inter_intra). Width and height determine the remainder of the id:
+ // width = 8, height = 8: id = id_offset + log2(8) - 3.
+ // width = 8, height = 16: id = id_offset + log2(min(width, height)) - 3 + 1.
+ // ...
+ if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+ return param_.subsampling_x * 13 + param_.subsampling_y * 13;
+ }
+ if (param_.is_inter_intra && !param_.is_wedge_inter_intra) {
+ return 39 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+ }
+ if (param_.is_inter_intra && param_.is_wedge_inter_intra) {
+ return 60 + param_.subsampling_x * 7 + param_.subsampling_y * 7;
+ }
+ return 0;
+ }
+
+ int GetDigestId() const {
+ int id = GetDigestIdOffset();
+ if (param_.width == param_.height) {
+ return id + 3 * (FloorLog2(param_.width) - 3);
+ }
+ if (param_.width < param_.height) {
+ return id + 1 + 3 * (FloorLog2(param_.width) - 3);
+ }
+ return id + 2 + 3 * (FloorLog2(param_.height) - 3);
+ }
+
+ void Test(const char* digest, int num_runs);
+
+ private:
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ static constexpr int kStride = kMaxSuperBlockSizeInPixels;
+ static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel);
+ const MaskBlendTestParam param_ = GetParam();
+ alignas(kMaxAlignment) PredType
+ source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ alignas(kMaxAlignment) PredType
+ source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ uint8_t mask_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+ uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels *
+ kMaxSuperBlockSizeInPixels] = {};
+ dsp::MaskBlendFunc func_;
+ dsp::InterIntraMaskBlendFunc8bpp func_8bpp_;
+};
+
+template <int bitdepth, typename Pixel>
+void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+ const int num_runs) {
+ if (func_ == nullptr && func_8bpp_ == nullptr) return;
+ const int width = param_.width >> param_.subsampling_x;
+ const int height = param_.height >> param_.subsampling_y;
+
+ // Add the id offset to the seed to add more randomness to the input blocks.
+ // If the same seed were used for every block size, the generated input
+ // blocks would repeat: for example, an 8x8 input would simply be the
+ // upper-left portion of the generated 16x16 block.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+ GetDigestIdOffset());
+ PredType* src_1 = source1_;
+ uint8_t* src_1_8bpp = source1_8bpp_;
+ PredType* src_2 = source2_;
+ uint8_t* src_2_8bpp = source2_8bpp_;
+ const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width;
+ uint8_t* mask_row = mask_;
+ const int range_mask = (1 << (bitdepth)) - 1;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+ src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask);
+ if (param_.is_inter_intra && bitdepth == 8) {
+ src_1_8bpp[x] = src_1[x];
+ src_2_8bpp[x] = src_2[x];
+ }
+ if (!param_.is_inter_intra) {
+ // Implies isCompound == true.
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ }
+ src_1 += width;
+ src_1_8bpp += width;
+ src_2 += src_2_stride;
+ src_2_8bpp += src_2_stride;
+ }
+ // The mask should be set up regardless of subsampling.
+ for (int y = 0; y < param_.height; ++y) {
+ for (int x = 0; x < param_.width; ++x) {
+ mask_row[x] = rnd.Rand8() & 63;
+ mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64].
+ }
+ mask_row += kStride;
+ }
+
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ const absl::Time start = absl::Now();
+ if (param_.is_inter_intra && bitdepth == 8) {
+ ASSERT_EQ(func_, nullptr);
+ static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), "");
+ // source2_8bpp_ is modified in the call.
+ memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_));
+ func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride,
+ width, height);
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x];
+ }
+ }
+ memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_));
+ } else {
+ if (bitdepth != 8) {
+ ASSERT_EQ(func_8bpp_, nullptr);
+ }
+ func_(source1_, source2_, src_2_stride, mask_, kStride, width, height,
+ dest_, kDestStride);
+ }
+ elapsed_time += absl::Now() - start;
+ }
+
+ test_utils::CheckMd5Digest(
+ "MaskBlend",
+ absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), digest,
+ dest_, sizeof(dest_), elapsed_time);
+}
+
+const MaskBlendTestParam kMaskBlendTestParam[] = {
+ // is_inter_intra = false, is_wedge_inter_intra = false.
+ // block size range is from 8x8 to 128x128.
+ MaskBlendTestParam(8, 8, 0, 0, false, false),
+ MaskBlendTestParam(8, 16, 0, 0, false, false),
+ MaskBlendTestParam(16, 8, 0, 0, false, false),
+ MaskBlendTestParam(16, 16, 0, 0, false, false),
+ MaskBlendTestParam(16, 32, 0, 0, false, false),
+ MaskBlendTestParam(32, 16, 0, 0, false, false),
+ MaskBlendTestParam(32, 32, 0, 0, false, false),
+ MaskBlendTestParam(32, 64, 0, 0, false, false),
+ MaskBlendTestParam(64, 32, 0, 0, false, false),
+ MaskBlendTestParam(64, 64, 0, 0, false, false),
+ MaskBlendTestParam(64, 128, 0, 0, false, false),
+ MaskBlendTestParam(128, 64, 0, 0, false, false),
+ MaskBlendTestParam(128, 128, 0, 0, false, false),
+ MaskBlendTestParam(8, 8, 1, 0, false, false),
+ MaskBlendTestParam(8, 16, 1, 0, false, false),
+ MaskBlendTestParam(16, 8, 1, 0, false, false),
+ MaskBlendTestParam(16, 16, 1, 0, false, false),
+ MaskBlendTestParam(16, 32, 1, 0, false, false),
+ MaskBlendTestParam(32, 16, 1, 0, false, false),
+ MaskBlendTestParam(32, 32, 1, 0, false, false),
+ MaskBlendTestParam(32, 64, 1, 0, false, false),
+ MaskBlendTestParam(64, 32, 1, 0, false, false),
+ MaskBlendTestParam(64, 64, 1, 0, false, false),
+ MaskBlendTestParam(64, 128, 1, 0, false, false),
+ MaskBlendTestParam(128, 64, 1, 0, false, false),
+ MaskBlendTestParam(128, 128, 1, 0, false, false),
+ MaskBlendTestParam(8, 8, 1, 1, false, false),
+ MaskBlendTestParam(8, 16, 1, 1, false, false),
+ MaskBlendTestParam(16, 8, 1, 1, false, false),
+ MaskBlendTestParam(16, 16, 1, 1, false, false),
+ MaskBlendTestParam(16, 32, 1, 1, false, false),
+ MaskBlendTestParam(32, 16, 1, 1, false, false),
+ MaskBlendTestParam(32, 32, 1, 1, false, false),
+ MaskBlendTestParam(32, 64, 1, 1, false, false),
+ MaskBlendTestParam(64, 32, 1, 1, false, false),
+ MaskBlendTestParam(64, 64, 1, 1, false, false),
+ MaskBlendTestParam(64, 128, 1, 1, false, false),
+ MaskBlendTestParam(128, 64, 1, 1, false, false),
+ MaskBlendTestParam(128, 128, 1, 1, false, false),
+ // is_inter_intra = true, is_wedge_inter_intra = false.
+ // block size range is from 8x8 to 32x32.
+ MaskBlendTestParam(8, 8, 0, 0, true, false),
+ MaskBlendTestParam(8, 16, 0, 0, true, false),
+ MaskBlendTestParam(16, 8, 0, 0, true, false),
+ MaskBlendTestParam(16, 16, 0, 0, true, false),
+ MaskBlendTestParam(16, 32, 0, 0, true, false),
+ MaskBlendTestParam(32, 16, 0, 0, true, false),
+ MaskBlendTestParam(32, 32, 0, 0, true, false),
+ MaskBlendTestParam(8, 8, 1, 0, true, false),
+ MaskBlendTestParam(8, 16, 1, 0, true, false),
+ MaskBlendTestParam(16, 8, 1, 0, true, false),
+ MaskBlendTestParam(16, 16, 1, 0, true, false),
+ MaskBlendTestParam(16, 32, 1, 0, true, false),
+ MaskBlendTestParam(32, 16, 1, 0, true, false),
+ MaskBlendTestParam(32, 32, 1, 0, true, false),
+ MaskBlendTestParam(8, 8, 1, 1, true, false),
+ MaskBlendTestParam(8, 16, 1, 1, true, false),
+ MaskBlendTestParam(16, 8, 1, 1, true, false),
+ MaskBlendTestParam(16, 16, 1, 1, true, false),
+ MaskBlendTestParam(16, 32, 1, 1, true, false),
+ MaskBlendTestParam(32, 16, 1, 1, true, false),
+ MaskBlendTestParam(32, 32, 1, 1, true, false),
+ // is_inter_intra = true, is_wedge_inter_intra = true.
+ // block size range is from 8x8 to 32x32.
+ MaskBlendTestParam(8, 8, 0, 0, true, true),
+ MaskBlendTestParam(8, 16, 0, 0, true, true),
+ MaskBlendTestParam(16, 8, 0, 0, true, true),
+ MaskBlendTestParam(16, 16, 0, 0, true, true),
+ MaskBlendTestParam(16, 32, 0, 0, true, true),
+ MaskBlendTestParam(32, 16, 0, 0, true, true),
+ MaskBlendTestParam(32, 32, 0, 0, true, true),
+ MaskBlendTestParam(8, 8, 1, 0, true, true),
+ MaskBlendTestParam(8, 16, 1, 0, true, true),
+ MaskBlendTestParam(16, 8, 1, 0, true, true),
+ MaskBlendTestParam(16, 16, 1, 0, true, true),
+ MaskBlendTestParam(16, 32, 1, 0, true, true),
+ MaskBlendTestParam(32, 16, 1, 0, true, true),
+ MaskBlendTestParam(32, 32, 1, 0, true, true),
+ MaskBlendTestParam(8, 8, 1, 1, true, true),
+ MaskBlendTestParam(8, 16, 1, 1, true, true),
+ MaskBlendTestParam(16, 8, 1, 1, true, true),
+ MaskBlendTestParam(16, 16, 1, 1, true, true),
+ MaskBlendTestParam(16, 32, 1, 1, true, true),
+ MaskBlendTestParam(32, 16, 1, 1, true, true),
+ MaskBlendTestParam(32, 32, 1, 1, true, true),
+};
+
+using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>;
+
+TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest8bpp, DISABLED_Speed) {
+ Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>;
+
+TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest10bpp, DISABLED_Speed) {
+ Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
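
Editor's note: the digest index mapping used by GetDigestIdOffset()/GetDigestId() in the test above can be checked in isolation. The stand-alone sketch below mirrors that mapping; only FloorLog2's behavior is assumed to match libgav1's helper, and the other names are illustrative.

#include <cassert>

// Same result as libgav1's FloorLog2 for positive inputs.
int FloorLog2(int n) {
  int log = -1;
  while (n != 0) {
    n >>= 1;
    ++log;
  }
  return log;
}

// Mirrors MaskBlendTest::GetDigestId(): square blocks first, then the taller
// and wider variants, three entries per power-of-two size starting at 8.
int DigestId(int id_offset, int width, int height) {
  if (width == height) return id_offset + 3 * (FloorLog2(width) - 3);
  if (width < height) return id_offset + 1 + 3 * (FloorLog2(width) - 3);
  return id_offset + 2 + 3 * (FloorLog2(height) - 3);
}

int main() {
  assert(DigestId(0, 8, 8) == 0);    // no subsampling, is_inter_intra = false.
  assert(DigestId(0, 8, 16) == 1);
  assert(DigestId(0, 16, 8) == 2);
  assert(DigestId(13, 8, 8) == 13);  // subsampling_x == 1 offset.
  return 0;
}
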
diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc
new file mode 100644
index 0000000..3a47cc7
--- /dev/null
+++ b/src/dsp/motion_field_projection_test.cc
@@ -0,0 +1,213 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMotionFieldWidth = 160;
+constexpr int kMotionFieldHeight = 120;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionFieldProjectionTest : public testing::TestWithParam<int> {
+ public:
+ MotionFieldProjectionTest() = default;
+ MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete;
+ MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) =
+ delete;
+ ~MotionFieldProjectionTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(8);
+ MotionFieldProjectionInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MotionFieldProjectionInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ MotionFieldProjectionInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(8);
+ ASSERT_NE(dsp, nullptr);
+ target_motion_field_projection_kernel_func_ =
+ dsp->motion_field_projection_kernel;
+ }
+
+ void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd);
+ void TestRandomValues(bool speed);
+
+ private:
+ MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_;
+ ReferenceInfo reference_info_;
+ TemporalMotionField motion_field_;
+};
+
+void MotionFieldProjectionTest::SetInputData(
+ const int motion_field_width, libvpx_test::ACMRandom* const rnd) {
+ ASSERT_TRUE(reference_info_.Reset(kMotionFieldHeight, motion_field_width));
+ ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHeight, motion_field_width,
+ /*zero_initialize=*/false));
+ ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHeight,
+ motion_field_width,
+ /*zero_initialize=*/false));
+ constexpr int order_hint_bits = 6;
+ unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits);
+ const unsigned int current_frame_order_hint =
+ rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63]
+ uint8_t reference_frame_order_hint = 0;
+ reference_info_.relative_distance_to[0] = 0;
+ reference_info_.skip_references[kReferenceFrameIntra] = true;
+ reference_info_.projection_divisions[kReferenceFrameIntra] = 0;
+ for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) {
+ reference_frame_order_hint =
+ rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63]
+ const int relative_distance_to =
+ GetRelativeDistance(current_frame_order_hint,
+ reference_frame_order_hint, order_hint_shift_bits);
+ reference_info_.relative_distance_to[i] = relative_distance_to;
+ reference_info_.skip_references[i] =
+ relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0;
+ reference_info_.projection_divisions[i] =
+ reference_info_.skip_references[i]
+ ? 0
+ : kProjectionMvDivisionLookup[relative_distance_to];
+ }
+ for (int y = 0; y < kMotionFieldHeight; ++y) {
+ for (int x = 0; x < motion_field_width; ++x) {
+ reference_info_.motion_field_reference_frame[y][x] =
+ static_cast<ReferenceFrameType>(rnd->Rand16() &
+ kReferenceFrameAlternate);
+ reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512;
+ reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512;
+ }
+ }
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = kInvalidMvValue;
+ MotionVector* const motion_field_mv = &motion_field_.mv[0][0];
+ int8_t* const motion_field_reference_offset =
+ &motion_field_.reference_offset[0][0];
+ std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(),
+ invalid_mv);
+ std::fill(
+ motion_field_reference_offset,
+ motion_field_reference_offset + motion_field_.reference_offset.size(),
+ -128);
+}
+
+void MotionFieldProjectionTest::TestRandomValues(bool speed) {
+ static const char* const kDigestMv[8] = {
+ "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7",
+ "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3",
+ "9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47",
+ "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"};
+ static const char* const kDigestReferenceOffset[8] = {
+ "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540",
+ "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36",
+ "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e",
+ "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"};
+ const int num_tests = speed ? 2000 : 1;
+ if (target_motion_field_projection_kernel_func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int width_idx = 0; width_idx < 8; ++width_idx) {
+ const int motion_field_width = kMotionFieldWidth + width_idx;
+ SetInputData(motion_field_width, &rnd);
+ const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1;
+ const int reference_to_current_with_sign =
+ rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance;
+ assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance);
+ // The y8 and x8 step is at least 16, except possibly for the last block.
+ for (int step = 16; step <= 80; step += 16) {
+ const absl::Time start = absl::Now();
+ for (int k = 0; k < num_tests; ++k) {
+ for (int y8 = 0; y8 < kMotionFieldHeight; y8 += step) {
+ const int y8_end = std::min(y8 + step, kMotionFieldHeight);
+ for (int x8 = 0; x8 < motion_field_width; x8 += step) {
+ const int x8_end = std::min(x8 + step, motion_field_width);
+ target_motion_field_projection_kernel_func_(
+ reference_info_, reference_to_current_with_sign, dst_sign, y8,
+ y8_end, x8, x8_end, &motion_field_);
+ }
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MotionFieldProjectionKernel",
+ absl::StrFormat("(mv) width %d step %d", motion_field_width, step)
+ .c_str(),
+ kDigestMv[width_idx], motion_field_.mv[0],
+ sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(),
+ elapsed_time);
+ test_utils::CheckMd5Digest(
+ "MotionFieldProjectionKernel",
+ absl::StrFormat("(ref offset) width %d step %d", motion_field_width,
+ step)
+ .c_str(),
+ kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0],
+ sizeof(motion_field_.reference_offset[0][0]) *
+ motion_field_.reference_offset.size(),
+ elapsed_time);
+ }
+ }
+}
+
+TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0));
+#endif
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
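
Editor's note: TestRandomValues() above walks the motion field in square tiles of size step, clamping the last tile to the field boundary before invoking the kernel. A minimal sketch of that tiling pattern follows; the dimensions and the visit callback are placeholders, not libgav1 API.

#include <algorithm>
#include <cstdio>

// Calls visit(y8, y8_end, x8, x8_end) for every step x step tile covering a
// height x width grid; the last tile in each direction is clamped.
template <typename Visitor>
void ForEachTile(int height, int width, int step, Visitor visit) {
  for (int y8 = 0; y8 < height; y8 += step) {
    const int y8_end = std::min(y8 + step, height);
    for (int x8 = 0; x8 < width; x8 += step) {
      const int x8_end = std::min(x8 + step, width);
      visit(y8, y8_end, x8, x8_end);
    }
  }
}

int main() {
  // A 120x160 field with 64-wide tiles: the rightmost column is 32 wide and
  // the bottom row is 56 tall.
  ForEachTile(120, 160, 64, [](int y8, int y8_end, int x8, int x8_end) {
    std::printf("rows [%d, %d) cols [%d, %d)\n", y8, y8_end, x8, x8_end);
  });
  return 0;
}
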
diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc
new file mode 100644
index 0000000..a7b2ec8
--- /dev/null
+++ b/src/dsp/motion_vector_search_test.cc
@@ -0,0 +1,197 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <cstdint>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class MotionVectorSearchTest : public testing::TestWithParam<int>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ MotionVectorSearchTest() = default;
+ MotionVectorSearchTest(const MotionVectorSearchTest&) = delete;
+ MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete;
+ ~MotionVectorSearchTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(8);
+ MotionVectorSearchInit_C();
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ MotionVectorSearchInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ MotionVectorSearchInit_SSE4_1();
+ }
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ const Dsp* const dsp = GetDspTable(8);
+ ASSERT_NE(dsp, nullptr);
+ mv_projection_compound_[0] = dsp->mv_projection_compound[0];
+ mv_projection_compound_[1] = dsp->mv_projection_compound[1];
+ mv_projection_compound_[2] = dsp->mv_projection_compound[2];
+ mv_projection_single_[0] = dsp->mv_projection_single[0];
+ mv_projection_single_[1] = dsp->mv_projection_single[1];
+ mv_projection_single_[2] = dsp->mv_projection_single[2];
+ }
+
+ void SetInputData(libvpx_test::ACMRandom* rnd);
+ void TestRandomValues(bool speed);
+
+ private:
+ MvProjectionCompoundFunc mv_projection_compound_[3];
+ MvProjectionSingleFunc mv_projection_single_[3];
+ int reference_offsets_[2];
+ alignas(kMaxAlignment)
+ MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding];
+ int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding];
+ CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ alignas(kMaxAlignment)
+ CompoundMotionVector compound_mv_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+ alignas(kMaxAlignment)
+ MotionVector single_mv_[kMaxTemporalMvCandidates + 1]
+ [kMaxTemporalMvCandidatesWithPadding];
+};
+
+void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) {
+ reference_offsets_[0] =
+ Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+ reference_offsets_[1] =
+ Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance);
+ for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) {
+ temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance);
+ for (auto& mv : temporal_mvs_[i].mv) {
+ mv = rnd->Rand16Signed() / 8;
+ }
+ }
+ for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) {
+ for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ single_mv_[i][j].mv[k] = rnd->Rand16Signed();
+ for (auto& mv : compound_mv_[i][j].mv[k].mv) {
+ mv = rnd->Rand16Signed();
+ }
+ }
+ compound_mv_org_[i][j] = compound_mv_[i][j];
+ single_mv_org_[i][j] = single_mv_[i][j];
+ }
+ }
+}
+
+void MotionVectorSearchTest::TestRandomValues(bool speed) {
+ static const char* const kDigestCompound[3] = {
+ "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9",
+ "e42de30cd84fa4e7b8581a330ed08a8b"};
+ static const char* const kDigestSingle[3] = {
+ "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc",
+ "7e699d58df3954a38ff11c8e34151e66"};
+ const int num_tests = speed ? 1000000 : 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int function_index = 0; function_index < 3; ++function_index) {
+ SetInputData(&rnd);
+ if (mv_projection_compound_[function_index] == nullptr) continue;
+ const absl::Time start = absl::Now();
+ for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+ const int total_count = count + (count & 1);
+ for (int i = 0; i < num_tests; ++i) {
+ mv_projection_compound_[function_index](
+ temporal_mvs_, temporal_reference_offsets_, reference_offsets_,
+ count, compound_mv_[count]);
+ }
+ // SIMD implementations may compute one extra element.
+ // Restore the original value if it was overwritten.
+ for (int i = count; i < total_count; ++i) {
+ compound_mv_[count][i] = compound_mv_org_[count][i];
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MvProjectionCompound",
+ absl::StrFormat("function_index %d", function_index).c_str(),
+ kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_),
+ elapsed_time);
+ }
+ for (int function_index = 0; function_index < 3; ++function_index) {
+ SetInputData(&rnd);
+ if (mv_projection_single_[function_index] == nullptr) continue;
+ const absl::Time start = absl::Now();
+ for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) {
+ const int total_count = (count + 3) & ~3;
+ for (int i = 0; i < num_tests; ++i) {
+ mv_projection_single_[function_index](
+ temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0],
+ count, single_mv_[count]);
+ }
+ // SIMD implementations may compute up to three extra elements.
+ // Restore the original values if they were overwritten.
+ for (int i = count; i < total_count; ++i) {
+ single_mv_[count][i] = single_mv_org_[count][i];
+ }
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ test_utils::CheckMd5Digest(
+ "MvProjectionSingle",
+ absl::StrFormat("function_index %d", function_index).c_str(),
+ kDigestSingle[function_index], single_mv_, sizeof(single_mv_),
+ elapsed_time);
+ }
+}
+
+TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); }
+
+TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0));
+#endif
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
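
Editor's note: the restore step in TestRandomValues() above relies on rounding count up to the SIMD write granularity, to the next even number for the compound projection and to the next multiple of four for the single projection. A brief sketch of that arithmetic, with illustrative names:

#include <cassert>

// Number of elements a 2-wide compound kernel may actually write.
int RoundUpToEven(int count) { return count + (count & 1); }

// Number of elements a 4-wide single-mv kernel may actually write.
int RoundUpToMultipleOf4(int count) { return (count + 3) & ~3; }

int main() {
  assert(RoundUpToEven(5) == 6);
  assert(RoundUpToEven(6) == 6);
  assert(RoundUpToMultipleOf4(5) == 8);
  assert(RoundUpToMultipleOf4(8) == 8);
  // Elements in [count, total_count) are scratch that SIMD code may clobber;
  // the test copies the originals back before hashing the whole array.
  return 0;
}
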
diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc
new file mode 100644
index 0000000..60b10c6
--- /dev/null
+++ b/src/dsp/obmc_test.cc
@@ -0,0 +1,349 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kMaxBlendingBlockSize = 64;
+constexpr int kNumSpeedTests = 1000000;
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "76906f87892c30c7059a5c97e4838c42", "0b8670d937217c66425f2662b51eebbe",
+ "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f",
+ "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b",
+ "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c",
+ "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519",
+ "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b",
+ "e2152dea159249149ff4151111b73ed6", "6b44c0052789ce2fa4df882f35618e7d",
+ "1edd570bec7e63780d83588f6aacda25", "b04b81c9e52c58885907dc7f1ef2c11c",
+ "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14",
+ "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546",
+ "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2",
+ "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548",
+ "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4",
+ };
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed8bpp(int id) {
+ static const char* const kDigest[] = {
+ "c5b532f5960477bdd50684ab25fae0f4", "bf76ed404bc5674e0a4ff238efceb62b",
+ "5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9",
+ "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345",
+ "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c",
+ "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049",
+ "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b",
+ "e3a1960b0a7238db1184a3f9d8e9a4b2", "721c7e8ec3aa0608b64f10f7ff5427db",
+ "ba9938553703d520bc0ade427c397140", "8b6e15e8ecd234363f70f51c64b0aea1",
+ "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e",
+ "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70",
+ "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad",
+ "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5",
+ "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "6ab8f28e8fb3c4b10b23efee38d4154e", "d4374005d34e43e06c1b0c906289dadd",
+ "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075",
+ "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3",
+ "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d",
+ "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca",
+ "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6",
+ "d2e70b81882aeb3d9fccef89e7552a9d", "4a692ddf91905727bc524d91735cf93c",
+ "f5d882ee6d9ae6f7dfa467ca99301424", "58821b87e7d9d4388d6003ffcb3723d1",
+ "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df",
+ "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78",
+ "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc",
+ "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a",
+ "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1",
+ };
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed10bpp(int id) {
+ static const char* const kDigest[] = {
+ "df59e5fd6e0237a56381f3a516806eb8", "f478bdf43e0b91b8dc9b2661eb207e49",
+ "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5",
+ "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225",
+ "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246",
+ "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c",
+ "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863",
+ "a148bbafdc4466fbb700b31acccca8ac", "ff0566921ff2d5145f79fbf409508fb2",
+ "5da9d960988549f53b817003b93e4d01", "fa9028b2ed049ad71b5fd15f2daacbe5",
+ "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089",
+ "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d",
+ "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed",
+ "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac",
+ "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct ObmcTestParam {
+ ObmcTestParam(int width, int height, ObmcDirection blending_direction)
+ : width(width), height(height), blending_direction(blending_direction) {}
+ int width;
+ int height;
+ ObmcDirection blending_direction;
+};
+
+std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) {
+ return os << "BlockSize" << param.width << "x" << param.height
+ << ", blending_direction: " << ToString(param.blending_direction);
+}
+
+template <int bitdepth, typename Pixel>
+class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
+ public:
+ ObmcBlendTest() = default;
+ ~ObmcBlendTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ ObmcInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ if ((GetCpuInfo() & kSSE4_1) != 0) {
+ ObmcInit_SSE4_1();
+ }
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ ObmcInit_NEON();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = dsp->obmc_blend[blending_direction_];
+ }
+
+ protected:
+ int GetDigestId() const {
+ // blending_direction_ == 0:
+ // (width, height):
+ // (2, 2), id = 0. (2, 4), id = 1. (4, 2), id = 2.
+ // (4, 4), id = 3. (4, 8), id = 4. (8, 4), id = 5.
+ // ...
+ // blending_direction_ == 1: id starts from 13.
+ const int id = (blending_direction_ == kObmcDirectionVertical) ? 0 : 13;
+ if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1);
+ if (width_ < height_) return id + 1 + 3 * (FloorLog2(width_) - 1);
+ return id + 2 + 3 * (FloorLog2(height_) - 1);
+ }
+
+ // Note |digest| is only used when |use_fixed_values| is false.
+ void Test(const char* digest, bool use_fixed_values, int value);
+ void TestSpeed(const char* digest, int num_runs);
+
+ private:
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ const int blending_direction_ = GetParam().blending_direction;
+ Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+ Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+ dsp::ObmcBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+ const bool use_fixed_values,
+ const int value) {
+ if (func_ == nullptr) return;
+ if (use_fixed_values) {
+ std::fill(source1_,
+ source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+ std::fill(source2_,
+ source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+ } else {
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ Pixel* src_1 = source1_;
+ Pixel* src_2 = source2_;
+ const int mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ src_1[x] = rnd.Rand16() & mask;
+ src_2[x] = rnd.Rand16() & mask;
+ }
+ src_1 += kMaxBlendingBlockSize;
+ src_2 += kMaxBlendingBlockSize;
+ }
+ }
+ const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+ func_(source1_, stride, width_, height_, source2_, stride);
+ if (use_fixed_values) {
+ const bool success = test_utils::CompareBlocks(
+ source1_, source2_, width_, height_, kMaxBlendingBlockSize,
+ kMaxBlendingBlockSize, false);
+ EXPECT_TRUE(success);
+ } else {
+ test_utils::CheckMd5Digest(
+ "Obmc", absl::StrFormat("%dx%d", width_, height_).c_str(), digest,
+ source1_, sizeof(source1_), absl::Duration());
+ }
+}
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+ const int num_runs) {
+ if (func_ == nullptr) return;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ Pixel* src_1 = source1_;
+ Pixel* src_2 = source2_;
+ const int mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ src_1[x] = rnd.Rand16() & mask;
+ src_2[x] = rnd.Rand16() & mask;
+ }
+ src_1 += kMaxBlendingBlockSize;
+ src_2 += kMaxBlendingBlockSize;
+ }
+ const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+ uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize];
+ absl::Duration elapsed_time;
+ for (int i = 0; i < num_runs; ++i) {
+ memcpy(dest, source1_,
+ sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+ const absl::Time start = absl::Now();
+ func_(dest, stride, width_, height_, source2_, stride);
+ elapsed_time += absl::Now() - start;
+ }
+ memcpy(source1_, dest,
+ sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+ test_utils::CheckMd5Digest("Obmc",
+ absl::StrFormat("%dx%d", width_, height_).c_str(),
+ digest, source1_, sizeof(source1_), elapsed_time);
+}
+
+const ObmcTestParam kObmcTestParam[] = {
+ ObmcTestParam(2, 2, kObmcDirectionVertical),
+ ObmcTestParam(2, 4, kObmcDirectionVertical),
+ ObmcTestParam(4, 2, kObmcDirectionVertical),
+ ObmcTestParam(4, 4, kObmcDirectionVertical),
+ ObmcTestParam(4, 8, kObmcDirectionVertical),
+ ObmcTestParam(8, 4, kObmcDirectionVertical),
+ ObmcTestParam(8, 8, kObmcDirectionVertical),
+ ObmcTestParam(8, 16, kObmcDirectionVertical),
+ ObmcTestParam(16, 8, kObmcDirectionVertical),
+ ObmcTestParam(16, 16, kObmcDirectionVertical),
+ ObmcTestParam(16, 32, kObmcDirectionVertical),
+ ObmcTestParam(32, 16, kObmcDirectionVertical),
+ ObmcTestParam(32, 32, kObmcDirectionVertical),
+ ObmcTestParam(2, 2, kObmcDirectionHorizontal),
+ ObmcTestParam(2, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(4, 2, kObmcDirectionHorizontal),
+ ObmcTestParam(4, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(4, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 4, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(8, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 8, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(16, 32, kObmcDirectionHorizontal),
+ ObmcTestParam(32, 16, kObmcDirectionHorizontal),
+ ObmcTestParam(32, 32, kObmcDirectionHorizontal),
+};
+
+using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>;
+
+TEST_P(ObmcBlendTest8bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255);
+ Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) {
+ TestSpeed(
+ GetDigestSpeed8bpp(GetDigestId()),
+ (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>;
+
+TEST_P(ObmcBlendTest10bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1);
+ Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) {
+ TestSpeed(
+ GetDigestSpeed10bpp(GetDigestId()),
+ (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp,
+ testing::ValuesIn(kObmcTestParam));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest10bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
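
Editor's note: the DISABLED_Speed tests above scale the number of runs inversely with block area, so every block size processes roughly the same number of pixels. A small sketch of that normalization; kNumSpeedTests matches the value in this file, and the helper name is illustrative.

#include <cstdio>

constexpr int kNumSpeedTests = 1000000;

// Runs per block size so that runs * width * height stays constant and equal
// to kNumSpeedTests * 32 * 32 (the largest OBMC block in this test).
int NumRuns(int width, int height) {
  return (kNumSpeedTests * 32 * 32) / (height * width);
}

int main() {
  std::printf("32x32: %d runs\n", NumRuns(32, 32));  // 1000000
  std::printf("8x8:   %d runs\n", NumRuns(8, 8));    // 16000000
  std::printf("2x2:   %d runs\n", NumRuns(2, 2));    // 256000000
  return 0;
}
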
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
index d041bd1..abb01a1 100644
--- a/src/dsp/super_res.cc
+++ b/src/dsp/super_res.cc
@@ -26,10 +26,10 @@ namespace {
template <int bitdepth, typename Pixel>
void SuperRes_C(const void* /*coefficients*/, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
- const int initial_subpixel_x, const int step,
- void* const dest) {
+ const int initial_subpixel_x, const int step, void* const dest,
+ ptrdiff_t dest_stride) {
assert(step <= 1 << kSuperResScaleBits);
auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<Pixel*>(dest);
@@ -61,8 +61,8 @@ void SuperRes_C(const void* /*coefficients*/, void* const source,
(1 << bitdepth) - 1);
subpixel_x += step;
} while (++x < upscaled_width);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
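
Editor's note: this hunk splits the single stride parameter into independent source_stride and dest_stride, so the row advance of the downscaled source no longer has to match that of the upscaled destination. Below is a hedged sketch of the resulting row-walk shape; the parameter names follow the hunk, the filtering is replaced by placeholder work, and everything else is illustrative rather than the real SuperRes_C.

#include <cstddef>
#include <cstdint>

// Skeleton of the per-row loop after the change: src and dst advance by their
// own strides (measured in pixels here), which may differ.
void UpscaleRowsSketch(const uint8_t* src, ptrdiff_t source_stride,
                       uint8_t* dst, ptrdiff_t dest_stride, int height,
                       int upscaled_width) {
  int y = height;  // height is assumed to be > 0, as in SuperRes_C.
  do {
    for (int x = 0; x < upscaled_width; ++x) {
      dst[x] = src[0];  // placeholder for the filtered subpixel sample.
    }
    src += source_stride;
    dst += dest_stride;
  } while (--y != 0);
}
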
diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc
new file mode 100644
index 0000000..a93fc31
--- /dev/null
+++ b/src/dsp/super_res_test.cc
@@ -0,0 +1,264 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 5e5;
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9",
+ "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"};
+ return kDigestSuperRes[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3",
+ "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"};
+ return kDigestSuperRes[id];
+}
+#endif
+
+struct SuperResTestParam {
+ SuperResTestParam(int downscaled_width, int upscaled_width)
+ : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {}
+ int downscaled_width;
+ int upscaled_width;
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+class SuperResTest : public testing::TestWithParam<SuperResTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ SuperResTest() = default;
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ SuperResInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const std::vector<std::string> split_test_name =
+ absl::StrSplit(test_info->name(), '/');
+ ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_));
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ SuperResInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ SuperResInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ super_res_coefficients_ = dsp->super_res_coefficients;
+ func_ = dsp->super_res;
+ }
+
+ void TestComputeSuperRes(int fixed_value, int num_runs);
+
+ private:
+ static constexpr int kHeight = 127;
+ // The maximum width that must be allocated.
+ static constexpr int kUpscaledBufferWidth = 192;
+ // Allow room for the filter taps.
+ static constexpr int kStride =
+ ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15);
+ const int kDownscaledWidth = GetParam().downscaled_width;
+ const int kUpscaledWidth = GetParam().upscaled_width;
+ int test_id_;
+ SuperResCoefficientsFunc super_res_coefficients_;
+ SuperResFunc func_;
+ Pixel source_buffer_[kHeight][kStride];
+ alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride];
+ alignas(kMaxAlignment) Coefficient
+ superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth];
+};
+
+template <int bitdepth, typename Pixel, typename Coefficient>
+void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes(
+ int fixed_value, int num_runs) {
+ if (func_ == nullptr) return;
+ const int superres_width = kDownscaledWidth << kSuperResScaleBits;
+ const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth;
+ const int error = step * kUpscaledWidth - superres_width;
+ const int initial_subpixel_x =
+ ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) +
+ DivideBy2(kUpscaledWidth)) /
+ kUpscaledWidth +
+ (1 << (kSuperResExtraBits - 1)) - error / 2) &
+ kSuperResScaleMask;
+ if (super_res_coefficients_ != nullptr) {
+ super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step,
+ superres_coefficients_);
+ }
+ memset(dest_buffer_, 0, sizeof(dest_buffer_));
+ if (fixed_value != 0) {
+ SetBlock<Pixel>(kHeight, kStride, fixed_value, source_buffer_[0], kStride);
+ } else {
+ // Random values.
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kStride; ++x) {
+ source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask;
+ }
+ }
+ }
+ // Offset starting point in the buffer to accommodate line extension.
+ Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder;
+
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth,
+ kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+
+ if (fixed_value != 0) {
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kUpscaledWidth; ++x) {
+ EXPECT_TRUE(dest_buffer_[y][x] == fixed_value)
+ << "At location [" << y << ", " << x
+ << "]\nexpected: " << fixed_value
+ << "\nactual: " << dest_buffer_[y][x];
+ }
+ }
+ } else if (num_runs == 1) {
+ // Random values.
+ if ((kUpscaledWidth & 15) != 0) {
+ // The SIMD functions overwrite up to 15 pixels in each row. Reset them.
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) {
+ dest_buffer_[y][x] = 0;
+ }
+ }
+ }
+ const char* expected_digest;
+ if (bitdepth == 8) {
+ expected_digest = GetDigest8bpp(test_id_);
+ } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ expected_digest = GetDigest10bpp(test_id_);
+#endif
+ }
+ test_utils::CheckMd5Digest(
+ "SuperRes",
+ absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step,
+ initial_subpixel_x)
+ .c_str(),
+ expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time);
+ } else {
+ // Speed test.
+ printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n",
+ kUpscaledWidth, step, initial_subpixel_x,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+ }
+}
+
+using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>;
+
+TEST_P(SuperResTest8bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(255, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest8bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+const SuperResTestParam kSuperResTestParams[] = {
+ SuperResTestParam(96, 192),
+ SuperResTestParam(171, 192),
+ SuperResTestParam(102, 128),
+ SuperResTestParam(61, 121),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>;
+
+TEST_P(SuperResTest10bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(511, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest10bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc
new file mode 100644
index 0000000..e7384f4
--- /dev/null
+++ b/src/dsp/warp_test.cc
@@ -0,0 +1,649 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/base/macros.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/post_filter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kSourceBorderHorizontal = 16;
+constexpr int kSourceBorderVertical = 13;
+
+constexpr int kMaxSourceBlockWidth =
+ kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2;
+constexpr int kMaxSourceBlockHeight =
+ kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2;
+constexpr int kMaxDestBlockWidth =
+ kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+constexpr int kMaxDestBlockHeight =
+ kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2;
+
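+// Division lookup table used by GenerateApproximateDivisor() below; it
+// mirrors the table in warp_prediction.cc.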
+constexpr uint16_t kDivisorLookup[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192};
+
+template <bool is_compound>
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0",
+ "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9",
+ "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41",
+ "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac",
+ "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d",
+ "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72",
+ "7f17d290d513a7416761b3a01f10fd2f",
+ };
+ static const char* const kCompoundDigest[] = {
+ "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c",
+ "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc",
+ "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50",
+ "8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2",
+ "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af",
+ "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30",
+ "b2a0ce68db3cadd207299f73112bed74",
+ };
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <bool is_compound>
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b",
+ "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521",
+ "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97",
+ "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c",
+ "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33",
+ "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd",
+ "5ed615091e2f62df26de7e91a985cb81",
+ };
+ static const char* const kCompoundDigest[] = {
+ "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+ "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+ "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+ "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+ "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+ "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+ "42eb66e752e9ef289b47053b5c73fdd6",
+ };
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif
+
+int RandomWarpedParam(int seed_offset, int bits) {
+ libvpx_test::ACMRandom rnd(seed_offset +
+ libvpx_test::ACMRandom::DeterministicSeed());
+ // 1 in 8 chance of generating zero (arbitrary).
+ const bool zero = (rnd.Rand16() & 7) == 0;
+ if (zero) return 0;
+  // Generate uniform values in the range
+  // [-(1 << bits), -1] U [1, 1 << bits].
+ const int mask = (1 << bits) - 1;
+ const int value = 1 + (rnd.RandRange(1u << 31) & mask);
+ const bool sign = (rnd.Rand16() & 1) != 0;
+ return sign ? value : -value;
+}
+
+// This function is a copy from warp_prediction.cc.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+ int16_t* division_shift) {
+ const int n = FloorLog2(std::abs(value));
+ const T e = std::abs(value) - (static_cast<T>(1) << n);
+ const int entry = (n > kDivisorLookupBits)
+ ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+ : static_cast<int>(e << (kDivisorLookupBits - n));
+ *division_shift = n + kDivisorLookupPrecisionBits;
+ *division_factor =
+ (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// This function is a copy from warp_prediction.cc.
+int16_t GetShearParameter(int value) {
+ return static_cast<int16_t>(
+ LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits),
+ kWarpParamRoundingBits));
+}
+
+// This function is a copy from warp_prediction.cc.
+// This function is used here to help generate valid warp parameters.
+bool SetupShear(const int* params, int16_t* alpha, int16_t* beta,
+ int16_t* gamma, int16_t* delta) {
+ int16_t division_shift;
+ int16_t division_factor;
+ GenerateApproximateDivisor<int32_t>(params[2], &division_factor,
+ &division_shift);
+ const int alpha0 =
+ Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX);
+ const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX);
+ const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+ const int gamma0 =
+ Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift),
+ INT16_MIN, INT16_MAX);
+ const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+ const int delta0 = Clip3(
+ params[5] -
+ RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+ (1 << kWarpedModelPrecisionBits),
+ INT16_MIN, INT16_MAX);
+
+ *alpha = GetShearParameter(alpha0);
+ *beta = GetShearParameter(beta0);
+ *gamma = GetShearParameter(gamma0);
+ *delta = GetShearParameter(delta0);
+ if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >=
+ (1 << kWarpedModelPrecisionBits)) ||
+ (4 * std::abs(*gamma) + 4 * std::abs(*delta) >=
+ (1 << kWarpedModelPrecisionBits))) {
+ return false; // NOLINT (easier condition to understand).
+ }
+
+ return true;
+}
+
+void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta,
+ int16_t* gamma, int16_t* delta, int seed) {
+ do {
+ params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+ params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+ params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+ params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+ (1 << kWarpedModelPrecisionBits);
+ ++seed;
+ } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta));
+}
+
+struct WarpTestParam {
+ WarpTestParam(int width, int height) : width(width), height(height) {}
+ int width;
+ int height;
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+class WarpTest : public testing::TestWithParam<WarpTestParam> {
+ public:
+ WarpTest() = default;
+ ~WarpTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ WarpInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const absl::string_view test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ WarpInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ WarpInit_SSE4_1();
+ } else {
+ FAIL() << "Unrecognized architecture prefix in test case name: "
+ << test_case;
+ }
+ func_ = is_compound ? dsp->warp_compound : dsp->warp;
+ }
+
+ protected:
+ using DestType =
+ typename std::conditional<is_compound, uint16_t, Pixel>::type;
+
+ void SetInputData(bool use_fixed_values, int value);
+ void Test(bool use_fixed_values, int value, int num_runs = 1);
+ void TestFixedValues();
+ void TestRandomValues();
+ void TestSpeed();
+
+ const WarpTestParam param_ = GetParam();
+
+ private:
+ int warp_params_[8];
+ dsp::WarpFunc func_;
+  // Warp filters are 7-tap, which requires 3 pixels (kConvolveBorderLeftTop)
+  // of padding. Destination buffer indices are based on the subsampling
+  // values (x + y): 0: (4:4:4), 1: (4:2:2), 2: (4:2:0).
+ Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {};
+ DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {};
+};
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+ int value) {
+ if (use_fixed_values) {
+ for (int y = 0; y < param_.height; ++y) {
+ const int row = kSourceBorderVertical + y;
+ Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal,
+ value, param_.width);
+ }
+ } else {
+ const int mask = (1 << bitdepth) - 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int y = 0; y < param_.height; ++y) {
+ const int row = kSourceBorderVertical + y;
+ for (int x = 0; x < param_.width; ++x) {
+ const int column = kSourceBorderHorizontal + x;
+ source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask;
+ }
+ }
+ }
+ PostFilter::ExtendFrame<Pixel>(
+ &source_[kSourceBorderVertical * kMaxSourceBlockWidth +
+ kSourceBorderHorizontal],
+ param_.width, param_.height, kMaxSourceBlockWidth,
+ kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical,
+ kSourceBorderVertical);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values,
+ int value,
+ int num_runs /*= 1*/) {
+ if (func_ == nullptr) return;
+ SetInputData(use_fixed_values, value);
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ const int source_offset =
+ kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal;
+ const int dest_offset =
+ kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop;
+ const Pixel* const src = source_ + source_offset;
+ const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel);
+ const ptrdiff_t dst_stride =
+ is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel);
+
+ absl::Duration elapsed_time;
+ for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) {
+ for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) {
+ if (subsampling_x == 0 && subsampling_y == 1) {
+ // When both are 0: 4:4:4
+ // When both are 1: 4:2:0
+ // When only |subsampling_x| is 1: 4:2:2
+ // Having only |subsampling_y| == 1 is unsupported.
+ continue;
+ }
+ int params[8];
+ int16_t alpha;
+ int16_t beta;
+ int16_t gamma;
+ int16_t delta;
+ GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8());
+
+ const int dest_id = subsampling_x + subsampling_y;
+ DestType* const dst = dest_[dest_id] + dest_offset;
+ const absl::Time start = absl::Now();
+ for (int n = 0; n < num_runs; ++n) {
+ func_(src, src_stride, param_.width, param_.height, params,
+ subsampling_x, subsampling_y, 0, 0, param_.width, param_.height,
+ alpha, beta, gamma, delta, dst, dst_stride);
+ }
+ elapsed_time += absl::Now() - start;
+ }
+ }
+
+ if (use_fixed_values) {
+ // For fixed values, input and output are identical.
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) {
+      // When |is_compound| is true, the output holds a few more bits of
+      // precision and an offset value.
+ Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight];
+ const int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+ if (is_compound) {
+ for (int y = 0; y < param_.height; ++y) {
+ for (int x = 0; x < param_.width; ++x) {
+ const int compound_value =
+ dest_[i][dest_offset + y * kMaxDestBlockWidth + x];
+ const int remove_offset = compound_value - compound_offset;
+ const int full_shift =
+ remove_offset >>
+ (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical);
+ compensated_dest[y * kMaxDestBlockWidth + x] =
+ Clip3(full_shift, 0, (1 << bitdepth) - 1);
+ }
+ }
+ }
+ Pixel* pixel_dest =
+ is_compound ? compensated_dest
+ : reinterpret_cast<Pixel*>(dest_[i] + dest_offset);
+ const bool success = test_utils::CompareBlocks(
+ src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth,
+ kMaxDestBlockWidth, false);
+ EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i;
+ }
+ } else {
+ // (width, height):
+ // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2.
+ // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5.
+ // ...
+ // (128, 128), id = 12.
+ int id;
+ if (param_.width == param_.height) {
+ id = 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+ } else if (param_.width < param_.height) {
+ id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+ } else {
+ id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3);
+ }
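+    // For example, a 16x32 block gives FloorLog2(16) - 3 == 1 with
+    // width < height, so id == 1 + 3 * 1 == 4, matching the mapping above.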
+
+ const char* expected_digest;
+ if (bitdepth == 8) {
+ expected_digest = GetDigest8bpp<is_compound>(id);
+ } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ expected_digest = GetDigest10bpp<is_compound>(id);
+#endif
+ }
+ test_utils::CheckMd5Digest(
+ "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+ expected_digest, dest_, sizeof(dest_), elapsed_time);
+ }
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestFixedValues() {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << bitdepth) - 1);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestRandomValues() {
+ Test(false, 0);
+}
+
+template <bool is_compound, int bitdepth, typename Pixel>
+void WarpTest<is_compound, bitdepth, Pixel>::TestSpeed() {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ int min = 0, max = 0;
+ for (int i = 0; i < kSubPixelTaps; ++i) {
+ const int tap = filter[i];
+ if (tap > 0) {
+ max += max_input * tap;
+ min += min_input * tap;
+ } else {
+ min += max_input * tap;
+ max += min_input * tap;
+ }
+ }
+ *min_output = min;
+ *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+ const int8_t filter[kSubPixelTaps],
+ int* min_output, int* max_output) {
+ ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Warp process.
+template <int bitdepth>
+void ShowRange() {
+ constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsHorizontal12bpp
+ : kInterRoundBitsHorizontal;
+ constexpr int vertical_bits = (bitdepth == kBitdepth12)
+ ? kInterRoundBitsVertical12bpp
+ : kInterRoundBitsVertical;
+ constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical;
+
+ constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+ constexpr int max_input = (1 << bitdepth) - 1;
+
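+  // Index 93 is taken as a worst-case filter for this range analysis; the
+  // extreme coefficient sums over all filters are verified separately in
+  // WarpedFilterCoefficientSums.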
+ const int8_t* worst_warp_filter = kWarpedFilters8[93];
+
+ // First pass.
+ printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0,
+ max_input);
+
+ int min = 0, max = 0;
+ ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max);
+
+ int first_pass_offset;
+ if (bitdepth == 8) {
+    // Derive an offset for 8bpp.
+    for (first_pass_offset = 1; -first_pass_offset > min;
+ first_pass_offset <<= 1) {
+ }
+ printf(" 8bpp intermediate offset: %d.\n", first_pass_offset);
+ min += first_pass_offset;
+ max += first_pass_offset;
+ assert(min > 0);
+ assert(max < UINT16_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+ // offset is not required.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+ const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+ printf(" first pass output range: [%8d, %8d]\n", first_pass_min,
+ first_pass_max);
+
+ // Second pass.
+ if (bitdepth == 8) {
+ ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max);
+ } else {
+ ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter,
+ &min, &max);
+ }
+
+ if (bitdepth == 8) {
+ // Remove the offset that was applied in the first pass since we must use
+ // int32_t for this phase anyway. 128 is the sum of the filter taps.
+ const int offset_removal = (first_pass_offset >> horizontal_bits) * 128;
+ printf(" 8bpp intermediate offset removal: %d.\n", offset_removal);
+ max -= offset_removal;
+ min -= offset_removal;
+ assert(min < INT16_MIN && min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ } else {
+ // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+ // offset is not required.
+ assert(min > INT32_MIN);
+ assert(max > INT16_MAX && max < INT32_MAX);
+ }
+
+ printf(" intermediate range: [%8d, %8d]\n", min, max);
+
+ // Second pass non-compound output is clipped to Pixel values.
+ const int second_pass_min =
+ Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+ const int second_pass_max =
+ Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+ printf(" second pass output range: [%8d, %8d]\n", second_pass_min,
+ second_pass_max);
+
+ // Output is Pixel so matches Pixel values.
+ assert(second_pass_min == 0);
+ assert(second_pass_max == max_input);
+
+ const int compound_second_pass_min =
+ RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+ const int compound_second_pass_max =
+ RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+ printf(" compound second pass output range: [%8d, %8d]\n",
+ compound_second_pass_min, compound_second_pass_max);
+
+ if (bitdepth == 8) {
+ // 8bpp output is int16_t without an offset.
+ assert(compound_second_pass_min > INT16_MIN);
+ assert(compound_second_pass_max < INT16_MAX);
+ } else {
+ // 10bpp and 12bpp use the offset to fit inside uint16_t.
+ assert(compound_second_pass_min > 0);
+ assert(compound_second_pass_max < UINT16_MAX);
+ }
+
+ printf("\n");
+}
+
+TEST(WarpTest, ShowRange) {
+ ShowRange<kBitdepth8>();
+ ShowRange<kBitdepth10>();
+ ShowRange<kBitdepth12>();
+}
+
+using WarpTest8bpp = WarpTest</*is_compound=*/false, 8, uint8_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest8bpp = WarpTest</*is_compound=*/true, 8, uint8_t>;
+
+// Verifies the sum of the warped filter coefficients is 128 for every filter.
+//
+// Verifies the properties used in the calculation of ranges of variables in
+// the block warp process:
+// * The maximum sum of the positive warped filter coefficients is 175.
+// * The minimum (i.e., most negative) sum of the negative warped filter
+// coefficients is -47.
+//
+// NOTE: This test is independent of the bitdepth and the implementation of the
+// block warp function, so it just needs to be a test in the WarpTest8bpp class
+// and does not need to be defined with TEST_P.
+TEST(WarpTest8bpp, WarpedFilterCoefficientSums) {
+ int max_positive_sum = 0;
+ int min_negative_sum = 0;
+ for (const auto& filter : kWarpedFilters) {
+ int sum = 0;
+ int positive_sum = 0;
+ int negative_sum = 0;
+ for (const auto coefficient : filter) {
+ sum += coefficient;
+ if (coefficient > 0) {
+ positive_sum += coefficient;
+ } else {
+ negative_sum += coefficient;
+ }
+ }
+ EXPECT_EQ(sum, 128);
+ max_positive_sum = std::max(positive_sum, max_positive_sum);
+ min_negative_sum = std::min(negative_sum, min_negative_sum);
+ }
+ EXPECT_EQ(max_positive_sum, 175);
+ EXPECT_EQ(min_negative_sum, -47);
+}
+
+TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); }
+
+const WarpTestParam warp_test_param[] = {
+ WarpTestParam(8, 8), WarpTestParam(8, 16), WarpTestParam(16, 8),
+ WarpTestParam(16, 16), WarpTestParam(16, 32), WarpTestParam(32, 16),
+ WarpTestParam(32, 32), WarpTestParam(32, 64), WarpTestParam(64, 32),
+ WarpTestParam(64, 64), WarpTestParam(64, 128), WarpTestParam(128, 64),
+ WarpTestParam(128, 128),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp,
+ testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp,
+ testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WarpTest10bpp = WarpTest</*is_compound=*/false, 10, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest10bpp = WarpTest</*is_compound=*/true, 10, uint16_t>;
+
+TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param));
+#endif
+
+std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) {
+ return os << "BlockSize" << warp_param.width << "x" << warp_param.height;
+}
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc
new file mode 100644
index 0000000..77b608e
--- /dev/null
+++ b/src/dsp/weight_mask_test.cc
@@ -0,0 +1,390 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 50000;
+constexpr int kMaxPredictionSize = 128;
+// weight_mask is only used with kCompoundPredictionTypeDiffWeighted;
+// convolve produces the most extreme input ranges.
+// This includes kCompoundOffset in 10bpp and 12bpp.
+// See src/dsp/convolve.cc and src/dsp/warp.cc.
+constexpr int kCompoundPredictionRange[3][2] = {
+ // 8bpp
+ {-5132, 9212},
+ // 10bpp
+ {3988, 61532},
+ // 12bpp
+ {3974, 61559},
+};
+
+const char* GetDigest8bpp(int id) {
+ static const char* const kDigest[] = {
+ "035267cb2ac5a0f8ff50c2d30ad52226",
+ "3231f4972dd858b734e0cc48c4cd001e",
+ "7e163b69721a13ec9f75b5cd74ffee3f",
+ "" /*kBlock4x16*/,
+ "b75e90abc224acca8754c82039b3ba93",
+ "9f555f3a2c1a933a663d6103b8118dea",
+ "8539e54f34cd6668ff6e6606210be201",
+ "20f85c9db7c878c21fbf2052936f269e",
+ "620ec166de57b0639260b2d72eebfc3e",
+ "be666394b5a894d78f4097b6cca272fe",
+ "57a96816e84cdb381f596c23827b5922",
+ "f2e0d348f608f246b6d8d799b66c189e",
+ "161ac051f38372d9339d36728b9926ba",
+ "d5fad48aaf132a81cb62bba4f07bbebb",
+ "e10be2dca2f7dae38dae75150fc1612d",
+ "7f744481eb551bbc224b5236c82cbade",
+ "0d99bbf31ecddc1c2d5063a68c0e9375",
+ "5fb8ec5f582f0ebfe519ed55860f67c4",
+
+ // mask_is_inverse = true.
+ "a4250ca39daa700836138371d36d465f",
+ "abe9a9a1c3a5accda9bfefd4d6e81ccb",
+ "e95b08878d0bb5f2293c27c3a6fe0253",
+ "" /*kBlock4x16*/,
+ "e1c52be02ce9ab2800015bb08b866c31",
+ "eea1dc73811f73866edfeb4555865f20",
+ "3178e64085645bd819256a8ab43c7b0a",
+ "ee83884e4d5cd2c9ac04879116bab681",
+ "d107eff7d5ae9ba14d2c6b3b8d9fca49",
+ "400aeea7d299626fc336c46b1ad7a9d8",
+ "e9e26a400f67f3ad36350fe4171fc613",
+ "4c31ad714f470f34127febaf1bac714b",
+ "bbdcb1097c66d561dd4ea16b3fb73f97",
+ "3a21dfbf53e4c964e303a75a3308ce15",
+ "3416dab4512fd0dc61d788b433cd624e",
+ "68ace8f01fdd74aec3fee528c8167738",
+ "9fabe05a6523da81a45150e19f75acff",
+ "7c0643e4d02421d06d7ca71822a94e1d",
+ };
+ return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+ static const char* const kDigest[] = {
+ "1dc9bdd042e5228705b857b42798e364",
+ "c054c8644bd482ce78a139d8e063e013",
+ "bbe4ac48f013f34c84779da05b0bcbe0",
+ "" /*kBlock4x16*/,
+ "13d4759277637a607f25439182553708",
+ "f089667610561a47d50f9f930ad7c454",
+ "46715e6f7819f59725bdb083f4403255",
+ "3774541c339ae3af920ef2b1d6abf6a1",
+ "94913b01d226cb5eb273dfee84b51f65",
+ "be0c0847629dfff8e0e991ed67697a7d",
+ "716b5398b77d7459274d4ea9c91ebd8e",
+ "f5c1b0b461df4182529949472242b421",
+ "5e9576ea4cf107249ce4ae89a72b9c95",
+ "da021bcdf7936f7bd9a2399c69e4d37c",
+ "b3a310a39c1900e00f992839ff188656",
+ "9f3a15351af5945615f296242ec56a38",
+ "b6e0bd03c521c5f00e90530daa7d4432",
+ "3270d7f621d488aec5b76bcf121debd0",
+
+ // mask_is_inverse = true.
+ "33df96dd246683133eefe4caea6e3f7d",
+ "73e0ccc5d42806548a4b59f856256c1e",
+ "3561a0358cf831aee9477d07feafae2d",
+ "" /*kBlock4x16*/,
+ "c5a2e633c0cd6925e68f21f47f0e2d84",
+ "8755a2d3840dde5fd6a0cce6bd6642c5",
+ "85ec538b72cecd6ea1fddab5ce3b4e64",
+ "a53e0dec84c675c4c6b1f5792b0232ff",
+ "86180da325f9727670a98cf2dbf7410e",
+ "a5fdc95104948047e179b2bc3d47f51d",
+ "9b95b3858187838e4669180e2ddb295e",
+ "6e40ca55608f6bf2f8cd91c8dbf3ddbf",
+ "d3a092672e921b588279d57e50b31888",
+ "9883eb19b733ee9f1cb6a6b6a1a00bb5",
+ "dd34764e068b228b7820321b06864e63",
+ "6c743dc9c8c87c7044151d29993e5042",
+ "44925dab01011a98b8ab1f0308fa852a",
+ "6d984b2ccfa056278e2130771127a943",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct WeightMaskTestParam {
+ WeightMaskTestParam(int width, int height, bool mask_is_inverse)
+ : width(width), height(height), mask_is_inverse(mask_is_inverse) {}
+ int width;
+ int height;
+ bool mask_is_inverse;
+};
+
+std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) {
+ return os << param.width << "x" << param.height
+ << ", mask_is_inverse: " << param.mask_is_inverse;
+}
+
+template <int bitdepth>
+class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>,
+ public test_utils::MaxAlignedAllocable {
+ public:
+ WeightMaskTest() = default;
+ ~WeightMaskTest() override = default;
+
+ void SetUp() override {
+ test_utils::ResetDspTable(bitdepth);
+ WeightMaskInit_C();
+ const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ const int width_index = FloorLog2(width_) - 3;
+ const int height_index = FloorLog2(height_) - 3;
+ const testing::TestInfo* const test_info =
+ testing::UnitTest::GetInstance()->current_test_info();
+ const char* const test_case = test_info->test_suite_name();
+ if (absl::StartsWith(test_case, "C/")) {
+ } else if (absl::StartsWith(test_case, "NEON/")) {
+ WeightMaskInit_NEON();
+ } else if (absl::StartsWith(test_case, "SSE41/")) {
+ WeightMaskInit_SSE4_1();
+ }
+ func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_];
+ }
+
+ protected:
+ void SetInputData(bool use_fixed_values, int value_1, int value_2);
+ void Test(int num_runs, bool use_fixed_values, int value_1, int value_2);
+
+ private:
+ const int width_ = GetParam().width;
+ const int height_ = GetParam().height;
+ const bool mask_is_inverse_ = GetParam().mask_is_inverse;
+ using PredType =
+ typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+ alignas(
+ kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize];
+ alignas(
+ kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize];
+ uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {};
+ dsp::WeightMaskFunc func_;
+};
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::SetInputData(const bool use_fixed_values,
+ const int value_1,
+ const int value_2) {
+ if (use_fixed_values) {
+ std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize,
+ value_1);
+ std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize,
+ value_2);
+ } else {
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+ const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+ block_1_[y * width_ + x] =
+ static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ block_2_[y * width_ + x] =
+ static_cast<PredType>(rnd(max_val - min_val) + min_val);
+ }
+ }
+ }
+}
+
+BlockSize DimensionsToBlockSize(int width, int height) {
+ if (width == 4) {
+ if (height == 4) return kBlock4x4;
+ if (height == 8) return kBlock4x8;
+ if (height == 16) return kBlock4x16;
+ return kBlockInvalid;
+ }
+ if (width == 8) {
+ if (height == 4) return kBlock8x4;
+ if (height == 8) return kBlock8x8;
+ if (height == 16) return kBlock8x16;
+ if (height == 32) return kBlock8x32;
+ return kBlockInvalid;
+ }
+ if (width == 16) {
+ if (height == 4) return kBlock16x4;
+ if (height == 8) return kBlock16x8;
+ if (height == 16) return kBlock16x16;
+ if (height == 32) return kBlock16x32;
+ if (height == 64) return kBlock16x64;
+ return kBlockInvalid;
+ }
+ if (width == 32) {
+ if (height == 8) return kBlock32x8;
+ if (height == 16) return kBlock32x16;
+ if (height == 32) return kBlock32x32;
+ if (height == 64) return kBlock32x64;
+ return kBlockInvalid;
+ }
+ if (width == 64) {
+ if (height == 16) return kBlock64x16;
+ if (height == 32) return kBlock64x32;
+ if (height == 64) return kBlock64x64;
+ if (height == 128) return kBlock64x128;
+ return kBlockInvalid;
+ }
+ if (width == 128) {
+ if (height == 64) return kBlock128x64;
+ if (height == 128) return kBlock128x128;
+ return kBlockInvalid;
+ }
+ return kBlockInvalid;
+}
+
+template <int bitdepth>
+void WeightMaskTest<bitdepth>::Test(const int num_runs,
+ const bool use_fixed_values,
+ const int value_1, const int value_2) {
+ if (func_ == nullptr) return;
+ SetInputData(use_fixed_values, value_1, value_2);
+ const absl::Time start = absl::Now();
+ for (int i = 0; i < num_runs; ++i) {
+ func_(block_1_, block_2_, mask_, kMaxPredictionSize);
+ }
+ const absl::Duration elapsed_time = absl::Now() - start;
+ if (use_fixed_values) {
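+    // When the two predictions are identical the difference-weighted mask
+    // resolves to 38; when they sit at opposite ends of the compound range
+    // it saturates at the maximum weight of 64.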
+ int fixed_value = (value_1 - value_2 == 0) ? 38 : 64;
+ if (mask_is_inverse_) fixed_value = 64 - fixed_value;
+ for (int y = 0; y < height_; ++y) {
+ for (int x = 0; x < width_; ++x) {
+ ASSERT_EQ(static_cast<int>(mask_[y * kMaxPredictionSize + x]),
+ fixed_value)
+ << "x: " << x << " y: " << y;
+ }
+ }
+ } else {
+ const int id_offset = mask_is_inverse_ ? kMaxBlockSizes - 4 : 0;
+ const int id = id_offset +
+ static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4;
+ if (bitdepth == 8) {
+ test_utils::CheckMd5Digest(
+ absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
+ "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ } else {
+ test_utils::CheckMd5Digest(
+ absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
+ "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time);
+#endif
+ }
+ }
+}
+
+const WeightMaskTestParam weight_mask_test_param[] = {
+ WeightMaskTestParam(8, 8, false), WeightMaskTestParam(8, 16, false),
+ WeightMaskTestParam(8, 32, false), WeightMaskTestParam(16, 8, false),
+ WeightMaskTestParam(16, 16, false), WeightMaskTestParam(16, 32, false),
+ WeightMaskTestParam(16, 64, false), WeightMaskTestParam(32, 8, false),
+ WeightMaskTestParam(32, 16, false), WeightMaskTestParam(32, 32, false),
+ WeightMaskTestParam(32, 64, false), WeightMaskTestParam(64, 16, false),
+ WeightMaskTestParam(64, 32, false), WeightMaskTestParam(64, 64, false),
+ WeightMaskTestParam(64, 128, false), WeightMaskTestParam(128, 64, false),
+ WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true),
+ WeightMaskTestParam(8, 16, true), WeightMaskTestParam(8, 32, true),
+ WeightMaskTestParam(16, 8, true), WeightMaskTestParam(16, 16, true),
+ WeightMaskTestParam(16, 32, true), WeightMaskTestParam(16, 64, true),
+ WeightMaskTestParam(32, 8, true), WeightMaskTestParam(32, 16, true),
+ WeightMaskTestParam(32, 32, true), WeightMaskTestParam(32, 64, true),
+ WeightMaskTestParam(64, 16, true), WeightMaskTestParam(64, 32, true),
+ WeightMaskTestParam(64, 64, true), WeightMaskTestParam(64, 128, true),
+ WeightMaskTestParam(128, 64, true), WeightMaskTestParam(128, 128, true),
+};
+
+using WeightMaskTest8bpp = WeightMaskTest<8>;
+
+TEST_P(WeightMaskTest8bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[0][0];
+ const int max = kCompoundPredictionRange[0][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest8bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WeightMaskTest10bpp = WeightMaskTest<10>;
+
+TEST_P(WeightMaskTest10bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[1][0];
+ const int max = kCompoundPredictionRange[1][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest10bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+} // namespace dsp
+} // namespace libgav1
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 8e008d1..ec9f589 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -30,6 +30,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
@@ -138,13 +139,232 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
-void AverageBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
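+// One more than the 8bpp path's kInterPostRoundBit: the 10bpp blend rounds
+// away an extra bit of intermediate precision.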
+constexpr int kInterPostRoundBitPlusOne = 5;
+
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const __m128i& compound_offset,
+ const __m128i& round_offset, const __m128i& max,
+ const __m128i& zero, uint16_t* dst,
+ const ptrdiff_t dest_stride) {
+ // pred_0/1 max range is 16b.
+ const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+ const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+ const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+ const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+ const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+ const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+ const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+ const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+ const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+ const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+ // RightShiftWithRounding and Clip3.
+ const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+ const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+ const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+ const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+ if (width != 4) {
+ // Store width=8/16/32/64/128.
+ StoreUnaligned16(dst + offset, result);
+ return;
+ }
+ assert(width == 4);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dest_stride, result);
+}
+
+void AverageBlend10bpp_SSE4_1(const void* prediction_0,
+ const void* prediction_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const __m128i compound_offset =
+ _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+ const __m128i round_offset =
+ _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ int y = height;
+
+ if (width == 4) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+      // Rows 0 and 1 (the width == 4 path stores two rows per call).
+ AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 16) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 32) {
+ do {
+ // pred [0 - 15].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [16 - 31].
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ if (width == 64) {
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+      // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ assert(width == 128);
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+    // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+
+ // pred [64 - 95].
+ AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [96 - 127].
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h
index 937e8e2..cd07112 100644
--- a/src/dsp/x86/average_blend_sse4.h
+++ b/src/dsp/x86/average_blend_sse4.h
@@ -32,9 +32,13 @@ void AverageBlendInit_SSE4_1();
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
#if LIBGAV1_TARGETING_SSE4_1
+
#ifndef LIBGAV1_Dsp8bpp_AverageBlend
#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc
new file mode 100644
index 0000000..d41dc38
--- /dev/null
+++ b/src/dsp/x86/cdef_avx2.cc
@@ -0,0 +1,784 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
+ 420, 210, 140, 105, 420, 210, 140, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
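+// AddPartial_D0_D4() below implements this diagram: row i is shifted left by
+// i 16-bit elements into |partial_lo|, and the elements that overflow are
+// accumulated into |partial_hi|.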
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_d1_temp[8];
+ const __m256i v_zero = _mm256_setzero_si256();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
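+// For example, partial[7][1] accumulates 01 + 11 + 20 + 30, i.e. every x with
+// i / 2 + j == 1.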
+LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+ __m256i* partial) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m256i v_src[8];
+ for (auto& i : v_src) {
+ i = _mm256_castsi128_si256(LoadLo8(src));
+ // Dup lower lane.
+ i = _mm256_permute2x128_si256(i, i, 0x0);
+ src += stride;
+ }
+
+ const __m256i v_zero = _mm256_setzero_si256();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx
+ // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx
+ const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+ const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+ const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+ const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+ const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial[2] =
+ _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ const __m256i extend_reverse = SetrM128i(
+ _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+ static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
+ _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+ static_cast<int>(0x80048005),
+ static_cast<int>(0x80068007)));
+
+ for (auto& i : v_src) {
+ // Zero extend unsigned 8 to 16. The upper lane is reversed.
+ i = _mm256_shuffle_epi8(i, extend_reverse);
+ }
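+  // Each |v_src| register now holds the zero-extended row in its lower
+  // 128-bit lane and the same row reversed in its upper lane, so the
+  // AddPartial_D* helpers below compute a forward/reversed pair of partials
+  // (e.g. partial[0] and partial[4]) at once.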
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx
+ // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx
+ // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx
+ // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx
+ partial[6] = v_src[0];
+ for (int i = 1; i < 8; ++i) {
+ partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+ }
+
+ AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+ AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+ AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
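+// Sums the four 32-bit values within each 128-bit lane of |a|; the per-lane
+// total ends up in the lowest 32-bit element of that lane.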
+inline __m256i SumVectorPair_S32(__m256i a) {
+ a = _mm256_hadd_epi32(a, a);
+ a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+ return a;
+}
+
+// |cost[0]| and |cost[4]| square the input and sum it with the corresponding
+// element from the other end of the vector, then scale by the matching
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+ const __m256i partial_4,
+ const __m256i division_table) {
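+  // Broadcast the lower and upper 128-bit halves of |division_table| so both
+  // lanes (one per direction in the pair) see the same table entries.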
+ const __m256i division_table_0 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x0);
+ const __m256i division_table_1 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+ // partial_lo
+ const __m256i a = partial_0;
+ // partial_hi
+ const __m256i b = partial_4;
+
+ // Reverse and clear upper 2 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+ static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+ // 14 13 12 11 10 09 08 ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy the upper 32-bit sum to the lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[0] = _mm_cvtsi128_si32(sums);
+ cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
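+// Computes the costs of an odd direction pair (|index_a|, |index_b|), e.g.
+// directions 1 and 3, from their partial sums and the padded odd-direction
+// division table.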
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table[2]) {
+ // partial_lo
+ const __m256i a = partial_a;
+ // partial_hi
+ const __m256i b = partial_b;
+
+ // Reverse and clear upper 10 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504));
+
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy the upper 32-bit sum to the lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[index_a] = _mm_cvtsi128_si32(sums);
+ cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
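+// |cost[2]| and |cost[6]| sum the squares of their eight partial values and
+// scale the result by kCdefDivisionTable[7].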
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table) {
+  // The upper 128-bit lanes of |partial_a| and |partial_b| are don't-cares;
+  // pack their lower lanes into a single register and compute both costs from
+  // it.
+ const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+ const __m256i square_a = _mm256_madd_epi16(a, a);
+ const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+ const __m256i c = SumVectorPair_S32(b);
+  // Copy the upper 32-bit sum to the lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+ cost[2] = _mm_cvtsi128_si32(sums);
+ cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+
+ // partial[0] = add partial 0,4 low
+ // partial[1] = add partial 1,3 low
+ // partial[2] = add partial 2 low
+ // partial[3] = add partial 1,3 high
+ // partial[4] = add partial 0,4 high
+ // partial[5] = add partial 7,5 high
+ // partial[6] = add partial 6 low
+ // partial[7] = add partial 7,5 low
+ __m256i partial[8];
+
+ AddPartial(src, stride, partial);
+
+ const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+ const __m256i division_table_7 =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+ Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+ Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+ const __m256i division_table_odd[2] = {
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+ CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+ CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
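+  // The variance is the gap between the best direction's cost and the cost of
+  // the orthogonal direction ((best + 4) mod 8), scaled down by 2^10.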
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+  // Each |direction| describes a different set of source offsets. Expand this
+  // set by also negating each offset. For |direction| == 0 this gives a
+  // diagonal line from top right to bottom left. The first value is y, the
+  // second is x. Negative y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+ const __m128i& damping, const __m256i& threshold) {
+ const __m256i diff = _mm256_sub_epi16(pixel, reference);
+ const __m256i abs_diff = _mm256_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
+ const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+ // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+ // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
+ // larger than threshold. Subtract using saturation will return 0 when pixel
+ // == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m256i thresh_minus_shifted_diff =
+ _mm256_subs_epu16(threshold, shifted_diff);
+ const __m256i clamp_abs_diff =
+ _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+ const __m256i& tap, const __m128i& damping,
+ const __m256i& threshold) {
+ const __m256i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm256_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+ const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+ const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+ const __m256i secondary_tap_0 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+ const __m256i secondary_tap_1 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+ const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+ const __m256i primary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+ const __m256i secondary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+ int y = height;
+ do {
+ __m128i pixel_128;
+ if (width == 8) {
+ pixel_128 = LoadUnaligned16(src);
+ } else {
+ pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+ __m256i min = pixel;
+ __m256i max = pixel;
+ __m256i sum_pair;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val_128[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val_128, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val_128, direction);
+ }
+
+ __m256i primary_val[2];
+ primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+ primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, primary_val[0]);
+ min = _mm256_min_epu16(min, primary_val[1]);
+
+ // The source is 16 bits, however, we only really care about the lower
+ // 8 bits. The upper 8 bits contain the "large" flag. After the final
+ // primary max has been calculated, zero out the upper 8 bits. Use this
+ // to find the "16 bit" max.
+ const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
+ max = _mm256_max_epu16(
+ max, _mm256_and_si256(max_p01, cdef_large_value_mask));
+ }
+
+ sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum_pair = _mm256_setzero_si256();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val_128[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
+ }
+
+ __m256i secondary_val[4];
+ secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
+ secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
+ secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
+ secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, secondary_val[0]);
+ min = _mm256_min_epu16(min, secondary_val[1]);
+ min = _mm256_min_epu16(min, secondary_val[2]);
+ min = _mm256_min_epu16(min, secondary_val[3]);
+
+ const __m256i max_s01 =
+ _mm256_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m256i max_s23 =
+ _mm256_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
+ max = _mm256_max_epu8(max,
+ _mm256_and_si256(max_s, cdef_large_value_mask));
+ }
+
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+
+ __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
+ _mm256_extracti128_si256(sum_pair, 1));
+
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
+ if (clipping_required) {
+ const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
+ _mm256_extracti128_si256(min, 1));
+
+ const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
+ _mm256_extracti128_si256(max, 1));
+ // Clip3
+ sum = _mm_min_epi16(sum, max_128);
+ sum = _mm_max_epi16(sum, min_128);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_AVX2;
+
+ dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/cdef_avx2.h b/src/dsp/x86/cdef_avx2.h
new file mode 100644
index 0000000..41f2d3f
--- /dev/null
+++ b/src/dsp/x86/cdef_avx2.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc
index 3211a2d..6ede778 100644
--- a/src/dsp/x86/cdef_sse4.cc
+++ b/src/dsp/x86/cdef_sse4.cc
@@ -349,8 +349,8 @@ inline uint32_t SumVector_S32(__m128i a) {
inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
const __m128i division_table[2]) {
// Reverse and clear upper 2 bytes.
- const __m128i reverser =
- _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+ const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+ 0x03020504, 0x07060908, 0x0b0a0d0c);
// 14 13 12 11 10 09 08 ZZ
const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
// 00 14 01 13 02 12 03 11
@@ -371,7 +371,8 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b,
const __m128i division_table[2]) {
// Reverse and clear upper 10 bytes.
const __m128i reverser =
- _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504);
// 10 09 08 ZZ ZZ ZZ ZZ ZZ
const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
// 00 10 01 09 02 08 03 ZZ
@@ -717,7 +718,7 @@ void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h
index 4ce7de2..373116a 100644
--- a/src/dsp/x86/common_avx2.h
+++ b/src/dsp/x86/common_avx2.h
@@ -27,109 +27,60 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <cstring>
namespace libgav1 {
namespace dsp {
-
-//------------------------------------------------------------------------------
-// Compatibility functions.
-
-inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
- // For compatibility with older gcc toolchains (< 8) use
- // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
- // are implemented similarly to the following, clang uses a different method
- // but no differences in assembly have been observed.
- return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-//------------------------------------------------------------------------------
-// Load functions.
-
-inline __m256i LoadAligned32(const void* a) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- return _mm256_load_si256(static_cast<const __m256i*>(a));
-}
-
-inline void LoadAligned64(const void* a, __m256i dst[2]) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
- dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
-}
-
-inline __m256i LoadUnaligned32(const void* a) {
- return _mm256_loadu_si256(static_cast<const __m256i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m256i MaskOverreads(const __m256i source,
- const ptrdiff_t over_read_in_bytes) {
- __m256i dst = source;
-#if LIBGAV1_MSAN
- if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
- if (over_read_in_bytes > 0) {
- __m128i m = _mm_set1_epi8(-1);
- for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
- m = _mm_srli_si128(m, 1);
- }
- const __m256i mask = (over_read_in_bytes < 16)
- ? SetrM128i(_mm_set1_epi8(-1), m)
- : SetrM128i(m, _mm_setzero_si128());
- dst = _mm256_and_si256(dst, mask);
- }
-#else
- static_cast<void>(over_read_in_bytes);
-#endif
- return dst;
-}
-
-inline __m256i LoadAligned32Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
-}
-
-inline void LoadAligned64Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes,
- __m256i dst[2]) {
- dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
- dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
- over_read_in_bytes);
-}
-
-inline __m256i LoadUnaligned32Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void StoreAligned32(void* a, const __m256i v) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- _mm256_store_si256(static_cast<__m256i*>(a), v);
-}
-
-inline void StoreAligned64(void* a, const __m256i v[2]) {
- assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
- _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
- _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
-}
-
-inline void StoreUnaligned32(void* a, const __m256i v) {
- _mm256_storeu_si256(static_cast<__m256i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
- assert(bits <= 16);
- const __m256i v_bias_d =
- _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
- const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
- return _mm256_srai_epi16(v_tmp_d, bits);
-}
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/x86/common_avx2.inc b/src/dsp/x86/common_avx2.inc
new file mode 100644
index 0000000..53b4e2e
--- /dev/null
+++ b/src/dsp/x86/common_avx2.inc
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8) use
+ // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
+ // are implemented similarly to the following, clang uses a different method
+ // but no differences in assembly have been observed.
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+ if (over_read_in_bytes > 0) {
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+ return _mm256_srai_epi32(v_tmp_d, bits);
+}
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
index c510f8c..41a3a68 100644
--- a/src/dsp/x86/common_sse4.h
+++ b/src/dsp/x86/common_sse4.h
@@ -28,7 +28,6 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstdlib>
#include <cstring>
#if 0
@@ -71,192 +70,58 @@ inline void PrintRegX(const int r, const char* const name) {
#define PR(var, N) PrintReg(var, #var, N)
#define PD(var) PrintReg(var, #var);
#define PX(var) PrintRegX(var, #var);
-#endif // 0
-
-namespace libgav1 {
-namespace dsp {
-
-//------------------------------------------------------------------------------
-// Load functions.
-
-inline __m128i Load2(const void* src) {
- int16_t val;
- memcpy(&val, src, sizeof(val));
- return _mm_cvtsi32_si128(val);
-}
-
-inline __m128i Load2x2(const void* src1, const void* src2) {
- uint16_t val1;
- uint16_t val2;
- memcpy(&val1, src1, sizeof(val1));
- memcpy(&val2, src2, sizeof(val2));
- return _mm_cvtsi32_si128(val1 | (val2 << 16));
-}
-
-// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
-template <int lane>
-inline __m128i Load2(const void* const buf, __m128i val) {
- uint16_t temp;
- memcpy(&temp, buf, 2);
- return _mm_insert_epi16(val, temp, lane);
-}
-
-inline __m128i Load4(const void* src) {
- // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
- // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
- // movss instruction.
- //
- // Until compiler support of _mm_loadu_si32 is widespread, use of
- // _mm_loadu_si32 is banned.
- int val;
- memcpy(&val, src, sizeof(val));
- return _mm_cvtsi32_si128(val);
-}
-
-inline __m128i Load4x2(const void* src1, const void* src2) {
- // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
- // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
- // movss instruction.
- //
- // Until compiler support of _mm_loadu_si32 is widespread, use of
- // _mm_loadu_si32 is banned.
- int val1, val2;
- memcpy(&val1, src1, sizeof(val1));
- memcpy(&val2, src2, sizeof(val2));
- return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
-}
-inline __m128i LoadLo8(const void* a) {
- return _mm_loadl_epi64(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadHi8(const __m128i v, const void* a) {
- const __m128 x =
- _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
- return _mm_castps_si128(x);
-}
-
-inline __m128i LoadUnaligned16(const void* a) {
- return _mm_loadu_si128(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadAligned16(const void* a) {
- assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
- return _mm_load_si128(static_cast<const __m128i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m128i MaskOverreads(const __m128i source,
- const ptrdiff_t over_read_in_bytes) {
- __m128i dst = source;
#if LIBGAV1_MSAN
- if (over_read_in_bytes > 0) {
- __m128i mask = _mm_set1_epi8(-1);
- for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
- mask = _mm_srli_si128(mask, 1);
- }
- dst = _mm_and_si128(dst, mask);
- }
-#else
- static_cast<void>(over_read_in_bytes);
-#endif
- return dst;
-}
+#include <sanitizer/msan_interface.h>
-inline __m128i LoadLo8Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+inline void PrintShadow(const void* r, const char* const name,
+ const size_t size) {
+ fprintf(stderr, "Shadow for %s:\n", name);
+ __msan_print_shadow(r, size);
}
+#define PS(var, N) PrintShadow(var, #var, N)
-inline __m128i LoadHi8Msan(const __m128i v, const void* source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
-}
-
-inline __m128i LoadAligned16Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
-}
+#endif // LIBGAV1_MSAN
-inline __m128i LoadUnaligned16Msan(const void* const source,
- const ptrdiff_t over_read_in_bytes) {
- return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void Store2(void* dst, const __m128i x) {
- const int val = _mm_cvtsi128_si32(x);
- memcpy(dst, &val, 2);
-}
-
-inline void Store4(void* dst, const __m128i x) {
- const int val = _mm_cvtsi128_si32(x);
- memcpy(dst, &val, sizeof(val));
-}
-
-inline void StoreLo8(void* a, const __m128i v) {
- _mm_storel_epi64(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreHi8(void* a, const __m128i v) {
- _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
-}
-
-inline void StoreAligned16(void* a, const __m128i v) {
- assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
- _mm_store_si128(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreUnaligned16(void* a, const __m128i v) {
- _mm_storeu_si128(static_cast<__m128i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
- assert(bits <= 16);
- // Shift out all but the last bit.
- const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
- // Avg with zero will shift by 1 and round.
- return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
-}
-
-inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
- assert(bits <= 16);
- const __m128i v_bias_d =
- _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
- const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
- return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-//------------------------------------------------------------------------------
-// Masking utilities
-inline __m128i MaskHighNBytes(int n) {
- static constexpr uint8_t kMask[32] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- };
+#endif // 0
- return LoadUnaligned16(kMask + n);
-}
+namespace libgav1 {
+namespace dsp {
+namespace sse4 {
+
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace sse4
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/x86/common_sse4.inc b/src/dsp/x86/common_sse4.inc
new file mode 100644
index 0000000..35c56b8
--- /dev/null
+++ b/src/dsp/x86/common_sse4.inc
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ int16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
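+// Computes (v + (1 << (bits - 1))) >> bits for each unsigned 16-bit lane. For
+// example, with bits == 4 a lane value of 24 becomes (24 + 8) >> 4 == 2.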
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits < 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+ int bits) {
+ const __m128i v_bias_d =
+ _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
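+// Returns a 16-byte mask whose |n| most significant bytes are 0xff and whose
+// remaining low bytes are zero.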
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 3df2120..2ecb77c 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -26,7 +26,6 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_avx2.h"
-#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
@@ -35,7 +34,7 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-constexpr int kHorizontalOffset = 3;
+#include "src/dsp/x86/convolve_sse4.inc"
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
@@ -118,58 +117,15 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
}
template <int filter_index>
-__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
- const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
-
- if (filter_index == 3) {
- // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
- const __m128i v_src_43 = _mm_shuffle_epi8(
- v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
- const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
- return v_sum_43;
- }
-
- // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
- const __m128i v_src_32 = _mm_shuffle_epi8(
- v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
- // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
- const __m128i v_src_54 = _mm_shuffle_epi8(
- v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504));
- const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
- const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
- return v_sum_5432;
-}
-
-template <int filter_index>
-__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
- sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
- return _mm_packus_epi16(sum, sum);
-}
-
-template <int filter_index>
-__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+__m256i HorizontalTaps8To16(const __m256i* const src,
+ const __m256i* const v_tap) {
+ const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
// Filter 2xh sizes.
-template <int num_taps, int step, int filter_index, bool is_2d = false,
+template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
void* const dest, const ptrdiff_t pred_stride,
@@ -183,7 +139,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
assert(num_taps <= 4);
if (num_taps <= 4) {
if (!is_compound) {
- int y = 0;
+ int y = height;
+ if (is_2d) y -= 1;
do {
if (is_2d) {
const __m128i sum =
@@ -202,8 +159,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
+ y -= 2;
+ } while (y != 0);
// The 2d filters have an odd |height| because the horizontal pass
// generates context for the vertical pass.
@@ -236,7 +193,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
}
// Filter widths >= 4.
-template <int num_taps, int step, int filter_index, bool is_2d = false,
+template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
void* const dest, const ptrdiff_t pred_stride,
@@ -251,7 +208,22 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
int x = 0;
do {
if (is_2d || is_compound) {
- // placeholder
+ // Load into 2 128 bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+ LoadUnaligned16(&src[x + 24]));
+ const __m256i result2 =
+ HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[x], result);
+ StoreAligned32(&dest16[x + 16], result2);
+ } else {
+ StoreUnaligned32(&dest16[x], result);
+ StoreUnaligned32(&dest16[x + 16], result2);
+ }
} else {
// Load src used to calculate dest8[7:0] and dest8[23:16].
const __m256i src_long = LoadUnaligned32(&src[x]);
@@ -264,7 +236,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
// Combine results and store.
StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
}
- x += step * 4;
+ x += 32;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
@@ -272,9 +244,26 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
} while (--y != 0);
} else if (width == 16) {
int y = height;
+ if (is_2d) y -= 1;
do {
if (is_2d || is_compound) {
- // placeholder
+ // Load into 2 128 bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 =
+ SetrM128i(LoadUnaligned16(&src[src_stride]),
+ LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[0], result);
+ StoreAligned32(&dest16[pred_stride], result2);
+ } else {
+ StoreUnaligned32(&dest16[0], result);
+ StoreUnaligned32(&dest16[pred_stride], result2);
+ }
} else {
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
@@ -295,11 +284,37 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
dest16 += pred_stride * 2;
y -= 2;
} while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreAligned32(&dest16[0], result);
+ }
+
} else if (width == 8) {
int y = height;
+ if (is_2d) y -= 1;
do {
+ // Load into 2 128 bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- // placeholder
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreAligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ } else {
+ StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreUnaligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ }
} else {
const __m128i this_row = LoadUnaligned16(&src[0]);
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
@@ -315,11 +330,29 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
dest16 += pred_stride * 2;
y -= 2;
} while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ }
+
} else { // width == 4
int y = height;
+ if (is_2d) y -= 1;
do {
+ // Load into 2 128 bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- // placeholder
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
} else {
const __m128i this_row = LoadUnaligned16(&src[0]);
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
@@ -335,93 +368,176 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
dest16 += pred_stride * 2;
y -= 2;
} while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ }
}
}
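// [Editor's note, not part of the patch] In the 2D/compound branches above
// the source is split across the two 128-bit lanes (e.g. &src[x] and
// &src[x + 8]) because HorizontalTaps8To16 widens each output to 16 bits, so
// one 256-bit result only covers 16 intermediate values. A rough sketch of
// the layout, inferred from the loads and stores above:
//
//   lane 0: source pixels for dest16[x + 0 .. x + 7]
//   lane 1: source pixels for dest16[x + 8 .. x + 15]
//   result = HorizontalTaps8To16<filter_index>(&src_long, v_tap);  // 16 x s16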
template <int num_taps, bool is_2d_vertical = false>
LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
- __m128i* v_tap) {
+ __m256i* v_tap) {
if (num_taps == 8) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
- v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
+ v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6
} else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
}
} else if (num_taps == 6) {
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
- v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5
} else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
}
} else if (num_taps == 4) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
} else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
}
} else { // num_taps == 2
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
} else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
}
}
}
-template <int num_taps, bool is_2d_vertical = false>
-LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
- __m256i* v_tap) {
- if (num_taps == 8) {
- v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
- v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
- v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
- v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
- if (is_2d_vertical) {
- // placeholder
- }
- } else if (num_taps == 6) {
- v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
- v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
- v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
- if (is_2d_vertical) {
- // placeholder
- }
- } else if (num_taps == 4) {
- v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
- v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
- if (is_2d_vertical) {
- // placeholder
- }
- } else { // num_taps == 2
- v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
- if (is_2d_vertical) {
- // placeholder
+template <int num_taps, bool is_compound>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+ const __m256i* const taps) {
+ __m256i sum_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m256i sum_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m256i madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m256i madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ }
}
}
+
+ if (is_compound) {
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m256i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m256i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned32(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ const __m128i packed_sum = _mm_packus_epi16(
+ _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+ StoreUnaligned16(dst8_x, packed_sum);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 16;
+ } while (x < width);
}
template <bool is_2d = false, bool is_compound = false>
@@ -436,16 +552,16 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 4, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 5, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 8, 3, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -461,28 +577,792 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 8, 2, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 1, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 0, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 4, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 5, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 8, 3, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+void Convolve2D_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ if (width > 2) {
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+ width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH</*is_2d=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ }
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ }
+ }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m256i Compound1DShift(const __m256i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
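// [Editor's note, not part of the patch] The "- 1" here pairs with the use of
// kHalfSubPixelFilters: as the SumOnePassTaps comment removed from
// convolve_sse4.cc later in this commit puts it, the taps are pre-shifted by
// 1 so the sum cannot outrange int16_t, and one less bit of rounding shift is
// taken back here. A minimal sketch, with the halved taps as the assumption:
//
//   sum    = sum_i(src[i] * (tap[i] / 2));                  // half filters
//   result = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);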
+template <int filter_index, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
+ __m256i v_src[4];
+
+ if (!unpack_high) {
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ } else {
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ }
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
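// [Editor's note, not part of the patch] SumVerticalTaps interleaves rows in
// pairs because SumOnePassTaps (see the 128-bit version removed from
// convolve_sse4.cc later in this commit) multiplies interleaved sources with
// pair-wise taps such as k2k1, k4k3. A minimal sketch for the 6-tap case:
//
//   v_src[0] = unpack(srcs[0], srcs[1]);  // rows n+0/n+1 -> taps k2k1
//   v_src[1] = unpack(srcs[2], srcs[3]);  // rows n+2/n+3 -> taps k4k3
//   v_src[2] = unpack(srcs[4], srcs[5]);  // rows n+4/n+5 -> taps k6k5
//   sum = madd(v_src[0], k2k1) + madd(v_src[1], k4k3) + madd(v_src[2], k6k5);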
+template <int filter_index, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 32);
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m256i srcs[8];
+ srcs[0] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+ StoreUnaligned32(dst16_x, results);
+ StoreUnaligned32(dst16_x + 16, results_hi);
+ dst16_x += dst_stride;
+ } else {
+ const __m256i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+
+ StoreUnaligned32(dst8_x, packed_results);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 32;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the source pixels for the next row.
+ srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+ StoreUnaligned32(dst16, results);
+ StoreUnaligned32(dst16 + dst_stride, results_hi);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+ const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreUnaligned16(dst8, this_dst);
+ StoreUnaligned16(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
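// [Editor's note, not part of the patch] FilterVertical16xH above keeps each
// pair of adjacent rows in one __m256i (row n in the low lane, row n+1 in the
// high lane), so every loop iteration loads two new rows and writes two
// output rows. A rough sketch of the register rotation, following the code:
//
//   srcs[k] = [ row n+k | row n+k+1 ]          // after the inserti128 step
//   // ... compute sums for output rows n and n+1 ...
//   srcs[0] = srcs[2]; srcs[1] = srcs[3]; ...  // slide the window by two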
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the source pixels for the next row.
+ srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results = Compound1DShift(sums);
+ const __m128i this_dst = _mm256_castsi256_si128(results);
+ const auto next_dst = _mm256_extracti128_si256(results, 1);
+
+ StoreUnaligned16(dst16, this_dst);
+ StoreUnaligned16(dst16 + dst_stride, next_dst);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+ const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreLo8(dst8, this_dst);
+ StoreLo8(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += dst_stride;
+ } else {
+ const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8, _mm_packus_epi16(results, results));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ }
+ } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ }
+ }
+}
+
+void ConvolveCompoundVertical_AVX2(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = width;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else {
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ }
+ } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else {
+ SetupTaps<4>(&v_filter, taps);
+ FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ }
}
}
@@ -509,10 +1389,140 @@ void ConvolveHorizontal_AVX2(const void* const reference,
}
}
+void ConvolveCompoundHorizontal_AVX2(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+ // Quiet compiler error.
+ (void)pred_stride;
+#endif
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+ }
+}
+
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
+ dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
+
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
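// [Editor's note, not part of the patch] Reading of the table indices, an
// assumption based on the libgav1 Dsp::convolve layout rather than anything
// in this hunk: convolve[is_intra_block_copy][is_compound][has_vertical][has_horizontal],
// so [0][1][1][1] above would be the compound 2D (vertical + horizontal) entry.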
}
} // namespace
@@ -523,7 +1533,7 @@ void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_AVX2
+#else // !LIBGAV1_TARGETING_AVX2
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
index 6179d98..e509bc9 100644
--- a/src/dsp/x86/convolve_avx2.h
+++ b/src/dsp/x86/convolve_avx2.h
@@ -38,6 +38,22 @@ void ConvolveInit_AVX2();
#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
#endif // LIBGAV1_TARGETING_AVX2
#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index 3a0fff5..9b72fe4 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -34,41 +34,7 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-#include "src/dsp/convolve.inc"
-
-// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
-// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
-// sum from outranging int16_t.
-template <int filter_index>
-__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
- __m128i sum;
- if (filter_index < 2) {
- // 6 taps.
- const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
- const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
- const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
- sum = _mm_add_epi16(v_madd_21, v_madd_43);
- sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
- // 8 taps.
- const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
- const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
- const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
- const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
- const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
- sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
- // 2 taps.
- sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
- } else {
- // 4 taps.
- const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
- sum = _mm_add_epi16(v_madd_32, v_madd_54);
- }
- return sum;
-}
+#include "src/dsp/x86/convolve_sse4.inc"
template <int filter_index>
__m128i SumHorizontalTaps(const uint8_t* const src,
@@ -125,68 +91,7 @@ __m128i HorizontalTaps8To16(const uint8_t* const src,
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
-__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- const __m128i input0 = LoadLo8(&src[2]);
- const __m128i input1 = LoadLo8(&src[2 + src_stride]);
-
- if (filter_index == 3) {
- // 03 04 04 05 05 06 06 07 ....
- const __m128i input0_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
- // 13 14 14 15 15 16 16 17 ....
- const __m128i input1_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
- const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
- const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
- return v_sum_43;
- }
-
- // 02 03 03 04 04 05 05 06 06 07 ....
- const __m128i input0_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
- // 12 13 13 14 14 15 15 16 16 17 ....
- const __m128i input1_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
- // 04 05 05 06 06 07 07 08 ...
- const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
- // 14 15 15 16 16 17 17 18 ...
- const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
- const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
- const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
- const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
- const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
- return v_sum_5432;
-}
-
-template <int filter_index>
-__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
- sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
- return _mm_packus_epi16(sum, sum);
-}
-
-template <int filter_index>
-__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int num_taps, int step, int filter_index, bool is_2d = false,
+template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
void* const dest, const ptrdiff_t pred_stride,
@@ -197,7 +102,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
// 4 tap filters are never used when width > 4.
if (num_taps != 4 && width > 4) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -214,12 +119,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
return;
}
@@ -229,7 +134,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
assert(num_taps <= 4);
if (num_taps <= 4) {
if (width == 4) {
- int y = 0;
+ int y = height;
do {
if (is_2d || is_compound) {
const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
@@ -241,12 +146,13 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
return;
}
if (!is_compound) {
- int y = 0;
+ int y = height;
+ if (is_2d) y -= 1;
do {
if (is_2d) {
const __m128i sum =
@@ -265,8 +171,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
}
src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
+ y -= 2;
+ } while (y != 0);
// The 2d filters have an odd |height| because the horizontal pass
// generates context for the vertical pass.
@@ -298,303 +204,6 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
}
}
-template <int num_taps, bool is_2d_vertical = false>
-LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
- __m128i* v_tap) {
- if (num_taps == 8) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
- v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
- }
- } else if (num_taps == 6) {
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
- v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- }
- } else if (num_taps == 4) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- }
- } else { // num_taps == 2
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- }
- }
-}
-
-template <int num_taps, bool is_compound>
-__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
- const __m128i* const taps) {
- __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
- __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
- if (num_taps >= 4) {
- __m128i madd_lo =
- _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
- __m128i madd_hi =
- _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- if (num_taps >= 6) {
- madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
- madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- if (num_taps == 8) {
- madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
- madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- }
- }
- }
-
- if (is_compound) {
- return _mm_packs_epi32(
- RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
- RightShiftWithRounding_S32(sum_hi,
- kInterRoundBitsCompoundVertical - 1));
- }
-
- return _mm_packs_epi32(
- RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
- RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
-}
-
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const __m128i* const taps) {
- assert(width >= 8);
- constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- int x = 0;
- do {
- __m128i srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps >= 4) {
- srcs[1] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[2] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps >= 6) {
- srcs[3] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[4] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps == 8) {
- srcs[5] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[6] = LoadAligned16(src_x);
- src_x += src_stride;
- }
- }
- }
-
- int y = 0;
- do {
- srcs[next_row] = LoadAligned16(src_x);
- src_x += src_stride;
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
- if (is_compound) {
- StoreUnaligned16(dst16 + x + y * dst_stride, sum);
- } else {
- StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
- }
-
- srcs[0] = srcs[1];
- if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
- if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
- if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
- }
- }
- }
- } while (++y < height);
- x += 8;
- } while (x < width);
-}
-
-// Take advantage of |src_stride| == |width| to process two rows at a time.
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const __m128i* const taps) {
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- __m128i srcs[9];
- srcs[0] = LoadAligned16(src);
- src += 8;
- if (num_taps >= 4) {
- srcs[2] = LoadAligned16(src);
- src += 8;
- srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
- if (num_taps >= 6) {
- srcs[4] = LoadAligned16(src);
- src += 8;
- srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
- if (num_taps == 8) {
- srcs[6] = LoadAligned16(src);
- src += 8;
- srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
- }
- }
- }
-
- int y = 0;
- do {
- srcs[num_taps] = LoadAligned16(src);
- src += 8;
- srcs[num_taps - 1] = _mm_unpacklo_epi64(
- _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
- if (is_compound) {
- StoreUnaligned16(dst16, sum);
- dst16 += 4 << 1;
- } else {
- const __m128i results = _mm_packus_epi16(sum, sum);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- if (num_taps >= 4) {
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- if (num_taps >= 6) {
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- if (num_taps == 8) {
- srcs[5] = srcs[7];
- srcs[6] = srcs[8];
- }
- }
- }
- y += 2;
- } while (y < height);
-}
-
-// Take advantage of |src_stride| == |width| to process four rows at a time.
-template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const __m128i* const taps) {
- constexpr int next_row = (num_taps < 6) ? 4 : 8;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
-
- __m128i srcs[9];
- srcs[0] = LoadAligned16(src);
- src += 8;
- if (num_taps >= 6) {
- srcs[4] = LoadAligned16(src);
- src += 8;
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- if (num_taps == 8) {
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- }
- }
-
- int y = 0;
- do {
- srcs[next_row] = LoadAligned16(src);
- src += 8;
- if (num_taps == 2) {
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- } else if (num_taps == 4) {
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- } else if (num_taps == 6) {
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
- } else if (num_taps == 8) {
- srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
- srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
- srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
- }
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
- const __m128i results = _mm_packus_epi16(sum, sum);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
- // Therefore we don't need to check this condition when |height| > 4.
- if (num_taps <= 4 && height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- if (num_taps == 6) {
- srcs[1] = srcs[5];
- srcs[4] = srcs[8];
- } else if (num_taps == 8) {
- srcs[1] = srcs[5];
- srcs[2] = srcs[6];
- srcs[3] = srcs[7];
- srcs[4] = srcs[8];
- }
-
- y += 4;
- } while (y < height);
-}
-
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
@@ -607,28 +216,28 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 8, 2, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 1, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 0, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 4, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 5, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 8, 3, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -718,39 +327,6 @@ void Convolve2D_SSE4_1(const void* const reference,
}
}
-// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
-// Vertical calculations.
-__m128i Compound1DShift(const __m128i sum) {
- return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int filter_index>
-__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
- __m128i v_src[4];
-
- if (filter_index < 2) {
- // 6 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
- // 8 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
- // 2 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
- // 4 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- }
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- return sum;
-}
-
template <int filter_index, bool is_compound = false>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
@@ -787,7 +363,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
}
}
- int y = 0;
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
do {
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
@@ -795,11 +373,13 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ StoreUnaligned16(dst16_x, results);
+ dst16_x += dst_stride;
} else {
const __m128i results =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
- StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+ dst8_x += dst_stride;
}
srcs[0] = srcs[1];
@@ -815,506 +395,11 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
}
}
}
- } while (++y < height);
+ } while (--y != 0);
x += 8;
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
-void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- __m128i srcs[9];
-
- if (num_taps == 2) {
- srcs[2] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
-
- int y = 0;
- do {
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
- } else if (num_taps == 4) {
- srcs[4] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
- int y = 0;
- do {
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
- } else if (num_taps == 6) {
- srcs[6] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
- int y = 0;
- do {
- // 50 51 52 53
- const __m128i c = Load4(src);
- // 40 41 42 43 50 51 52 53
- srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
- src += src_stride;
- // 60 61 62 63
- srcs[6] = Load4(src);
- src += src_stride;
- // 50 51 52 53 60 61 62 63
- srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
- } else if (num_taps == 8) {
- srcs[8] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- // 50 51 52 53
- const __m128i c = Load4(src);
- // 40 41 42 43 50 51 52 53
- srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
- src += src_stride;
- // 60 61 62 63
- srcs[6] = Load4(src);
- src += src_stride;
- // 50 51 52 53 60 61 62 63
- srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
- int y = 0;
- do {
- // 70 71 72 73
- const __m128i d = Load4(src);
- // 60 61 62 63 70 71 72 73
- srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
- src += src_stride;
- // 80 81 82 83
- srcs[8] = Load4(src);
- src += src_stride;
- // 70 71 72 73 80 81 82 83
- srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- srcs[5] = srcs[7];
- srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
- }
-}
-
-template <int filter_index, bool negative_outside_taps = false>
-void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
- auto* dst8 = static_cast<uint8_t*>(dst);
-
- __m128i srcs[9];
-
- if (num_taps == 2) {
- srcs[2] = _mm_setzero_si128();
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
-
- int y = 0;
- do {
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[2] = Load2<0>(src, srcs[2]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41
- const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_2, 2);
- // This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- if (height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[2];
- y += 4;
- } while (y < height);
- } else if (num_taps == 4) {
- srcs[4] = _mm_setzero_si128();
-
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
-
- int y = 0;
- do {
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2<0>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4, 2);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
- // This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- if (height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- y += 4;
- } while (y < height);
- } else if (num_taps == 6) {
- // During the vertical pass the number of taps is restricted when
- // |height| <= 4.
- assert(height > 4);
- srcs[8] = _mm_setzero_si128();
-
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2(src);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
-
- int y = 0;
- do {
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61 70 71
- srcs[4] = Load2<3>(src, srcs[4]);
- src += src_stride;
- // 80 81
- srcs[8] = Load2<0>(src, srcs[8]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
- const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
- // 50 51 60 61 70 71 80 81
- srcs[5] = _mm_srli_si128(srcs_4_8, 2);
-
- // This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- srcs[1] = srcs[5];
- srcs[4] = srcs[8];
- y += 4;
- } while (y < height);
- } else if (num_taps == 8) {
- // During the vertical pass the number of taps is restricted when
- // |height| <= 4.
- assert(height > 4);
- srcs[8] = _mm_setzero_si128();
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2(src);
- src += src_stride;
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
-
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4, 2);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
- int y = 0;
- do {
- // 40 41 50 51 60 61 70 71
- srcs[4] = Load2<3>(src, srcs[4]);
- src += src_stride;
- // 80 81
- srcs[8] = Load2<0>(src, srcs[8]);
- src += src_stride;
- // 80 81 90 91
- srcs[8] = Load2<1>(src, srcs[8]);
- src += src_stride;
- // 80 81 90 91 a0 a1
- srcs[8] = Load2<2>(src, srcs[8]);
- src += src_stride;
-
- // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
- const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
- // 50 51 60 61 70 71 80 81
- srcs[5] = _mm_srli_si128(srcs_4_8, 2);
- // 60 61 70 71 80 81 90 91
- srcs[6] = _mm_srli_si128(srcs_4_8, 4);
- // 70 71 80 81 90 91 a0 a1
- srcs[7] = _mm_srli_si128(srcs_4_8, 6);
-
- // This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- srcs[1] = srcs[5];
- srcs[2] = srcs[6];
- srcs[3] = srcs[7];
- srcs[4] = srcs[8];
- y += 4;
- } while (y < height);
- }
-}
-
void ConvolveVertical_SSE4_1(const void* const reference,
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
@@ -1339,9 +424,9 @@ void ConvolveVertical_SSE4_1(const void* const reference,
if (filter_index < 2) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1349,9 +434,9 @@ void ConvolveVertical_SSE4_1(const void* const reference,
} else if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1359,9 +444,9 @@ void ConvolveVertical_SSE4_1(const void* const reference,
} else if (filter_index == 3) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1369,9 +454,9 @@ void ConvolveVertical_SSE4_1(const void* const reference,
} else if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1382,9 +467,9 @@ void ConvolveVertical_SSE4_1(const void* const reference,
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1474,8 +559,8 @@ void ConvolveCompoundVertical_SSE4_1(
if (filter_index < 2) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1484,8 +569,8 @@ void ConvolveCompoundVertical_SSE4_1(
SetupTaps<8>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1494,8 +579,8 @@ void ConvolveCompoundVertical_SSE4_1(
SetupTaps<2>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1504,8 +589,8 @@ void ConvolveCompoundVertical_SSE4_1(
SetupTaps<4>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1514,8 +599,8 @@ void ConvolveCompoundVertical_SSE4_1(
SetupTaps<4>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1752,7 +837,11 @@ inline void GetHalfSubPixelFilter(__m128i* output) {
template <int num_taps, int grade_x>
inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
__m128i* const source /*[num_taps >> 1]*/) {
- const __m128i src_vals = LoadUnaligned16(src);
+ // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+ // msan because it incorrectly models the outcome of the shuffles in some
+ // cases. This has not been reproduced out of context.
+ const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+ const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
source[0] = _mm_shuffle_epi8(src_vals, src_indices);
if (grade_x == 1) {
if (num_taps > 2) {
@@ -1768,7 +857,7 @@ inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
assert(grade_x > 1);
assert(num_taps != 4);
// grade_x > 1 also means width >= 8 && num_taps != 4
- const __m128i src_vals_ext = LoadLo8(src + 16);
+ const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
if (num_taps > 2) {
source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
src_indices);
@@ -1983,14 +1072,10 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
// |width_class| is 2, 4, or 8, according to the Store function that should be
// used.
template <int num_taps, int width_class, bool is_compound>
-#if LIBGAV1_MSAN
-__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
-#else
-inline void ConvolveVerticalScale(
-#endif
- const int16_t* src, const int width, const int subpixel_y,
- const int filter_index, const int step_y, const int height, void* dest,
- const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
constexpr int kernel_offset = (8 - num_taps) / 2;
const int16_t* src_y = src;
@@ -2819,7 +1904,7 @@ void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
new file mode 100644
index 0000000..550d6a4
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -0,0 +1,934 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128 bit functions used for sse4/avx2 convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
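+//
+// Inclusion sketch (illustrative only; the including .cc also provides the
+// width/filter-index dispatch and the DSP init functions around these
+// helpers):
+//
+//   namespace libgav1 {
+//   namespace dsp {
+//   namespace low_bitdepth {
+//   namespace {
+//   #include "src/dsp/x86/convolve_sse4.inc"
+//   // ... ConvolveVertical_SSE4_1(), Convolve2D_SSE4_1(), etc. ...
+//   }  // namespace
+//   }  // namespace low_bitdepth
+//   }  // namespace dsp
+//   }  // namespace libgav1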
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
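+
+// A scalar model of the accumulation above for a single output sample. The
+// |taps| here are assumed to already be halved (pre-shifted by 1), matching
+// the vectors produced by SetupTaps(); with 8-bit sources that headroom is
+// what keeps every partial sum within int16_t. Illustrative sketch only; the
+// kernels use the SIMD path above.
+inline int16_t SumOnePassTapsScalar(const uint8_t* const src,
+                                    const int8_t* const taps,
+                                    const int num_taps) {
+  int sum = 0;
+  for (int k = 0; k < num_taps; ++k) {
+    sum += src[k] * taps[k];
+  }
+  return static_cast<int16_t>(sum);
+}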
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+ static_cast<int>(0x80070706), 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
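+
+// Scalar illustration of the combined downshift above, assuming the 8bpp
+// constants kInterRoundBitsHorizontal == 3 and kFilterBits == 7; the total
+// shift is kFilterBits - 1 because the taps are pre-shifted by 1. Folding the
+// two rounding shifts into one only requires adding the skipped first stage's
+// rounding offset. Sketch only.
+inline int CombinedHorizontalDownshift(const int sum) {
+  const int two_stage = (((sum + 2) >> 2) + 8) >> 4;  // 2-bit, then 4-bit.
+  const int one_stage = ((sum + 2) + 32) >> 6;        // Folded into one shift.
+  assert(two_stage == one_stage);
+  static_cast<void>(two_stage);
+  return one_stage;
+}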
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
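+
+// Usage sketch for SetupTaps(): broadcast one 8-entry filter row into the
+// per-pair tap vectors consumed by the horizontal and vertical kernels. The
+// tap values here are placeholders that merely sum to 64 (a halved filter),
+// not one of the AV1 filter tables.
+inline void SetupTapsExample(__m128i v_tap[4]) {
+  alignas(16) constexpr int8_t kPlaceholderTaps[16] = {
+      -1, 2, -7, 38, 38, -7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+  const __m128i filter = LoadAligned16(kPlaceholderTaps);
+  SetupTaps<8>(&filter, v_tap);  // v_tap[0..3] now hold the k1k0..k7k6 pairs.
+}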
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
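+
+// Scalar model of the second (vertical) rounding stage above for one output,
+// operating on the 16-bit intermediates produced by the horizontal pass. The
+// shift is reduced by 1 because the vertical taps are also pre-shifted.
+// Sketch only; the real path packs the result with saturation.
+inline int16_t Sum2DVerticalScalar(const int16_t* const column,
+                                   const int16_t* const taps,
+                                   const int num_taps, const bool is_compound) {
+  int32_t sum = 0;
+  for (int k = 0; k < num_taps; ++k) {
+    sum += column[k] * taps[k];
+  }
+  const int shift = is_compound ? kInterRoundBitsCompoundVertical - 1
+                                : kInterRoundBitsVertical - 1;
+  return static_cast<int16_t>((sum + (1 << (shift - 1))) >> shift);
+}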
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
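+
+// Note on the loop structure above: |srcs| acts as a sliding window over the
+// intermediate rows. Each iteration loads only the newest row into
+// srcs[next_row] and then shifts the window down by one, so every
+// intermediate row is read just once per 8-wide column of output.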
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
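+
+// The 4xH path works because the intermediate buffer is contiguous with
+// |src_stride| == |width| == 4, so two adjacent rows of four 16-bit
+// intermediates occupy exactly one 128-bit load.
+static_assert(2 * 4 * sizeof(uint16_t) == sizeof(__m128i),
+              "Two 4-wide uint16_t rows must fill one 128-bit register.");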
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y -= 4;
+ } while (y != 0);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
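+
+// Scalar form of the shift above, assuming the 8bpp constant
+// kInterRoundBitsHorizontal == 3 (minus 1 for the pre-shifted taps, i.e. a
+// 2-bit rounding shift). Sketch only.
+inline int16_t Compound1DShiftScalar(const int sum) {
+  return static_cast<int16_t>((sum + 2) >> 2);
+}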
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
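+
+// Note: the unpacking above interleaves sample n of row i with sample n of
+// row i + 1 so that _mm_maddubs_epi16 applies each tap pair to vertically
+// adjacent pixels, mirroring the horizontal pairing done by the shuffles in
+// SumHorizontalTaps2x2().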
+
+// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
+// 2D version.
+template <int num_taps, int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = height;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = height;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = height;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = height;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = height;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ }
+}
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index deb57ef..3c29b19 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -30,6 +30,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
@@ -212,13 +213,231 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
-void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weight0,
+ const __m128i& weight1) {
+ // This offset is a combination of round_factor (which is added) and
+ // round_offset (which is subtracted). The shift by kInterPostRoundBit + 4
+ // corresponds to bitdepth 10.
+ constexpr int offset =
+ (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bias = _mm_set1_epi32(offset);
+ const __m128i clip_high = _mm_set1_epi16(kMax10bppSample);
+
+ __m128i prediction0 = _mm_cvtepu16_epi32(pred0);
+ __m128i mult0 = _mm_mullo_epi32(prediction0, weight0);
+ __m128i prediction1 = _mm_cvtepu16_epi32(pred1);
+ __m128i mult1 = _mm_mullo_epi32(prediction1, weight1);
+ __m128i sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+
+ prediction0 = _mm_unpackhi_epi16(pred0, zero);
+ mult0 = _mm_mullo_epi32(prediction0, weight0);
+ prediction1 = _mm_unpackhi_epi16(pred1, zero);
+ mult1 = _mm_mullo_epi32(prediction1, weight1);
+ sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+ const __m128i pack = _mm_packus_epi32(result0, result1);
+
+ return _mm_min_epi16(pack, clip_high);
+}
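+
+// Editor's illustration: a scalar sketch of the per-lane math above. The
+// function name BlendOnePixel10bpp is hypothetical; it is not used by the
+// SIMD paths and was not part of the original change.
+inline int BlendOnePixel10bpp(int pred_0, int pred_1, int weight_0,
+ int weight_1) {
+ constexpr int offset =
+ (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
+ const int sum = (pred_0 * weight_0 + pred_1 * weight_1 + offset) >>
+ (kInterPostRoundBit + 4);
+ // Clamp to the valid 10bpp range, matching packus + min in the vector code.
+ return (sum < 0) ? 0 : (sum > kMax10bppSample) ? kMax10bppSample : sum;
+}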
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadLo8(pred_0);
+ const __m128i src_10 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ __m128i src_0 = LoadHi8(src_00, pred_0);
+ __m128i src_1 = LoadHi8(src_10, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+ const __m128i src_01 = LoadLo8(pred_0);
+ const __m128i src_11 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ src_0 = LoadHi8(src_01, pred_0);
+ src_1 = LoadHi8(src_11, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+ StoreLo8(dst, res0);
+ dst += dest_stride;
+ StoreHi8(dst, res0);
+ dst += dest_stride;
+ StoreLo8(dst, res1);
+ dst += dest_stride;
+ StoreHi8(dst, res1);
+ dst += dest_stride;
+ y -= 4;
+ } while (y != 0);
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+ StoreUnaligned16(dst, res0);
+ dst += dest_stride;
+ StoreUnaligned16(dst, res1);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+ StoreUnaligned16(dst + x, res_lo);
+ x += 8;
+ StoreUnaligned16(dst + x, res_hi);
+ x += 8;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dst_stride);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
index 8646eca..dbb9f88 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.h
+++ b/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -36,6 +36,10 @@ void DistanceWeightedBlendInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
new file mode 100644
index 0000000..745c1ca
--- /dev/null
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -0,0 +1,514 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+// The function is overloaded for each type and bitdepth for simplicity.
+inline __m128i LoadSource(const int8_t* src) {
+ return _mm_cvtepi8_epi16(LoadLo8(src));
+}
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+inline __m128i LoadSource(const uint8_t* src) {
+ return _mm_cvtepu8_epi16(LoadLo8(src));
+}
+
+inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
+ return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
+}
+
+// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
+inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
+ StoreLo8(dest, _mm_packus_epi16(data, data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Load 8 values from source.
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
+
+// Load 8 values from source.
+inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
+
+// Store 8 values to dest.
+inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
+ StoreUnaligned16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16(luma);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8(luma));
+}
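+
+// Editor's note (illustration only): with |subsampling_x| != 0 the code above
+// is the scalar computation
+// average_luma[i] = (luma[2 * i] + luma[2 * i + 1] + 1) >> 1;
+// otherwise the eight luma samples are used as-is.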
+
+inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
+ }
+ return LoadUnaligned16(luma);
+}
+
+inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(
+ LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
+ LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
+ 1);
+ }
+ return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline __m128i Clip3(const __m128i value, const __m128i low,
+ const __m128i high) {
+ const __m128i clipped_to_ceiling = _mm_min_epi16(high, value);
+ return _mm_max_epi16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline __m128i GetScalingFactors(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+ alignas(16) int16_t start_vals[8];
+ if (bitdepth == 8) {
+ // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut.
+ // Currently this code results in a series of movzbl.
+ for (int i = 0; i < 8; ++i) {
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return LoadAligned16(start_vals);
+ }
+ alignas(16) int16_t end_vals[8];
+ // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ for (int i = 0; i < 8; ++i) {
+ const int index = source[i] >> 2;
+ start_vals[i] = scaling_lut[index];
+ end_vals[i] = scaling_lut[index + 1];
+ }
+ const __m128i start = LoadAligned16(start_vals);
+ const __m128i end = LoadAligned16(end_vals);
+ __m128i remainder = LoadSource(source);
+ remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1);
+ const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder);
+ return _mm_add_epi16(start, delta);
+}
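+
+// Editor's note (illustration only): for bitdepth 10 the code above linearly
+// interpolates between adjacent scaling_lut entries, roughly
+// index = source[i] >> 2;
+// remainder = source[i] & 3;
+// factor = lut[index] + (((lut[index + 1] - lut[index]) * remainder) >> 2);
+// with _mm_mulhrs_epi16 providing the rounded divide by 4.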
+
+// |scaling_shift| is in range [8,11].
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+ const __m128i scaling_shift) {
+ const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
+ return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
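+
+// Editor's note (illustration only): callers pass |15 - scaling_shift| as the
+// shift argument, and _mm_mulhrs_epi16 computes roughly (a * b) >> 15 with
+// rounding, so the result above is effectively
+// RightShiftWithRounding(noise * scaling, scaling_shift).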
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_luma);
+ const int safe_width = width & ~7;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_width; x += 8) {
+ // TODO(b/133525232): Make 16-pixel version of loop body.
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+
+ if (x < width) {
+ Pixel luma_buffer[8];
+ // Prevent arbitrary indices from entering GetScalingFactors.
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ const int valid_range = width - x;
+ memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+ } while (++y < height);
+ out_y_row = static_cast<Pixel*>(dest_plane_y);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+ const Pixel* average_luma_buffer,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const __m128i scaling_shift) {
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const __m128i orig = LoadSource(chroma_cursor);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+ ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+ ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+ alignas(16) Pixel luma_buffer[16];
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
+ // need to be guarded from overread, even if |chroma_width| is divisible by 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+ // case &in_y_row[x] can be passed to GetScalingFactors directly.
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ // This section only runs if width % (8 << sub_x) != 0. It should never run
+ // on 720p and above.
+ if (x < chroma_width) {
+ // Prevent huge indices from entering GetScalingFactors due to
+ // uninitialized values. This is not a problem in 8bpp because the table
+ // is made larger than 255 values.
+ if (bitdepth > 8) {
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ }
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_SSE4_1(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// |offset| is a duplicated 16-bit chroma offset; it is added after the
+// _mm_madd_epi16 results are shifted and packed back to 16 bits.
+inline __m128i BlendChromaValsNoCfl8bpp(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
+ const int8_t* noise_image_cursor, const __m128i& average_luma,
+ const __m128i& scaling_shift, const __m128i& offset,
+ const __m128i& weights) {
+ uint8_t merged_buffer[8];
+ const __m128i combined_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
+ const __m128i combined_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
+ const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
+ _mm_srai_epi32((combined_hi), 6));
+
+ const __m128i merged = _mm_add_epi16(merged_base, offset);
+
+ StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
+ const __m128i scaling =
+ GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<8>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
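+
+// Editor's note (illustration only): per lane, the merged scaling index
+// computed above is
+// merged = Clip3(((average_luma * luma_multiplier +
+// chroma * chroma_multiplier) >> 6) + chroma_offset, 0, 255);
+// which then indexes the scaling lookup table applied to the noise.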
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
+ ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
+ ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
+ // will need to be guarded from overread, even if |chroma_width| is a
+ // multiple of 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+ alignas(16) uint8_t luma_buffer[16];
+ const __m128i offset = _mm_set1_epi16(chroma_offset);
+ const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
+ (luma_multiplier & 0xFFFF));
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ // There is no need to pre-initialize this buffer, because merged values
+ // used as indices are saturated in the 8bpp case. Uninitialized values
+ // are written outside the frame.
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const int valid_range_chroma = chroma_width - x;
+ uint8_t chroma_buffer[8];
+ memcpy(chroma_buffer, &in_chroma_row[x],
+ valid_range_chroma * sizeof(in_chroma_row[0]));
+
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ const __m128i orig_chroma =
+ LoadSourceMsan(chroma_buffer, valid_range_chroma);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_SSE4_1(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_SSE4_1(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+ source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/film_grain_sse4.h b/src/dsp/x86/film_grain_sse4.h
new file mode 100644
index 0000000..1cacbac
--- /dev/null
+++ b/src/dsp/x86/film_grain_sse4.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
index 4a8658d..d6af907 100644
--- a/src/dsp/x86/intra_edge_sse4.cc
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -22,7 +22,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
@@ -259,7 +259,7 @@ void IntraEdgeInit_SSE4_1() { Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
index fac1556..f2dcfdb 100644
--- a/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_SSE4_1
@@ -29,9 +29,48 @@
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
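+
+// Editor's note (illustration only): in both helpers the horizontal add
+// produces the sum of each 2x2 luma block, and the shift left by 1 turns that
+// sum into the block average scaled by 8, matching the precision of the
+// 4:4:4 path (which shifts samples left by 3).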
+
+} // namespace
+
namespace low_bitdepth {
namespace {
@@ -40,8 +79,8 @@ namespace {
inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
__m128i alpha_sign, __m128i dc_q0) {
- __m128i ac_q3 = LoadUnaligned16(input);
- __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
__m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
return _mm_add_epi16(scaled_luma_q0, dc_q0);
@@ -88,8 +127,7 @@ void CflIntraPredictor_SSE4_1(
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
@@ -119,12 +157,15 @@ void CflSubsampler444_4xH_SSE4_1(
} while (y < visible_height);
if (!is_inside) {
- int y = visible_height;
+ // Replicate the 2 high lanes.
+ samples = _mm_shuffle_epi32(samples, 0xee);
do {
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
StoreHi8(luma_ptr, samples);
luma_ptr += kCflLumaBufferStride;
sum = _mm_add_epi16(sum, samples);
- ++y;
+ y += 2;
} while (y < block_height);
}
@@ -152,15 +193,15 @@ void CflSubsampler444_4xH_SSE4_1(
static_assert(block_height_log2 <= 4, "");
assert(max_luma_width >= 4);
assert(max_luma_height >= 4);
- const int block_height = 1 << block_height_log2;
- const int block_width = 4;
+ static_cast<void>(max_luma_width);
+ constexpr int block_height = 1 << block_height_log2;
- if (block_height <= max_luma_height && block_width <= max_luma_width) {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
- luma, max_luma_width, max_luma_height, source, stride);
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
} else {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
- luma, max_luma_width, max_luma_height, source, stride);
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
}
}
@@ -302,19 +343,9 @@ void CflSubsampler444_SSE4_1(
__m128i inner_sum_lo, inner_sum_hi;
int y = 0;
do {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 0; x < 16; x++) {
- const int x_index = std::min(x, visible_width_16 - 1);
- c_arr[x] = src[x_index] << 3;
- }
- samples0 = LoadUnaligned16(c_arr);
- samples1 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_16);
-#else
- __m128i samples01 = LoadUnaligned16(src);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
if (!inside) {
const __m128i border16 =
@@ -323,26 +354,15 @@ void CflSubsampler444_SSE4_1(
}
samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr, samples0);
StoreUnaligned16(luma_ptr + 8, samples1);
__m128i inner_sum = _mm_add_epi16(samples0, samples1);
if (block_width == 32) {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 16; x < 32; x++) {
- const int x_index = std::min(x, visible_width_32 - 1);
- c_arr[x - 16] = src[x_index] << 3;
- }
- samples2 = LoadUnaligned16(c_arr);
- samples3 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_32);
-#else
- __m128i samples23 = LoadUnaligned16(src + 16);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
if (!inside) {
const __m128i border32 =
_mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
@@ -350,7 +370,6 @@ void CflSubsampler444_SSE4_1(
}
samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr + 16, samples2);
StoreUnaligned16(luma_ptr + 24, samples3);
@@ -418,29 +437,6 @@ void CflSubsampler444_SSE4_1(
}
}
-// Takes in two sums of input row pairs, and completes the computation for two
-// output rows.
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreLo8(luma_ptr, result);
- StoreHi8(luma_ptr + kCflLumaBufferStride, result);
- return result;
-}
-
-// Takes two halves of a vertically added pair of rows and completes the
-// computation for one output row.
-inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreUnaligned16(luma_ptr, result);
- return result;
-}
-
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -511,17 +507,6 @@ void CflSubsampler420_4xH_SSE4_1(
}
}
-// This duplicates the last two 16-bit values in |row|.
-inline __m128i LastRowSamples(const __m128i row) {
- return _mm_shuffle_epi32(row, 0xFF);
-}
-
-// This duplicates the last 16-bit value in |row|.
-inline __m128i LastRowResult(const __m128i row) {
- const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
- return _mm_shuffle_epi32(dup_row, 0xFF);
-}
-
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -655,10 +640,11 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
__m128i final_sum = zero;
const int block_height = 1 << block_height_log2;
const int luma_height = std::min(block_height, max_luma_height >> 1);
+ static_assert(max_luma_width <= 32, "");
int16_t* luma_ptr = luma[0];
__m128i final_row_result;
- // Begin first y section, covering width up to 16.
+ // Begin first y section, covering width up to 32.
int y = 0;
do {
const uint8_t* src_next = src + stride;
@@ -694,29 +680,32 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
final_row_result =
StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
sum = _mm_add_epi16(sum, final_row_result);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ sum = _mm_add_epi16(sum, wide_fill);
+ sum = _mm_add_epi16(sum, wide_fill);
+ }
final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
src += stride << 1;
luma_ptr += kCflLumaBufferStride;
} while (++y < luma_height);
- // Because max_luma_width is at most 32, any values beyond x=16 will
- // necessarily be duplicated.
- if (block_width_log2 == 5) {
- const __m128i wide_fill = LastRowResult(final_row_result);
- // Multiply duplicated value by number of occurrences, height * 4, since
- // there are 16 in each row and the value appears in the vector 4 times.
- final_sum = _mm_add_epi32(
- final_sum,
- _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
- }
-
// Begin second y section.
if (y < block_height) {
const __m128i final_fill0 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
const __m128i final_fill1 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+
+ if (block_width_log2 == 5) {
+ // There are 16 16-bit fill values per row, but only 4 lanes remain after
+ // widening to 32 bits, so shift left by 2 (multiply by 4) to account for
+ // all of them.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+
const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
@@ -726,6 +715,9 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
do {
StoreUnaligned16(luma_ptr, final_fill0);
StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
luma_ptr += kCflLumaBufferStride;
final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
@@ -747,14 +739,10 @@ inline void CflSubsampler420Impl_WxH_SSE4_1(
const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
final_row_result = _mm_sub_epi16(samples1, averages);
StoreUnaligned16(luma_ptr + 8, final_row_result);
- }
- if (block_width_log2 == 5) {
- int16_t* wide_luma_ptr = luma[0] + 16;
- const __m128i wide_fill = LastRowResult(final_row_result);
- for (int i = 0; i < block_height;
- ++i, wide_luma_ptr += kCflLumaBufferStride) {
- StoreUnaligned16(wide_luma_ptr, wide_fill);
- StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
}
}
}
@@ -958,7 +946,882 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
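+
+// Editor's note (illustration only): since |alpha_q12| is |alpha| << 9 and
+// _mm_mulhrs_epi16 divides by 1 << 15 with rounding, the value returned above
+// is approximately dc + ((alpha * ac_q3) >> 6), i.e. the unclipped CfL
+// prediction.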
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ constexpr int kCflLumaBufferStrideLog2_16i = 5;
+ constexpr int kCflLumaBufferStrideLog2_128i =
+ kCflLumaBufferStrideLog2_16i - 3;
+ constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
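+ // Editor's note: |dest| is expected to already contain the DC prediction,
+ // so dst[0] below seeds the CfL output.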
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ const __m128i min = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
+ stride >>= 1;
+
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ res = ClipEpi16(res, min, max);
+ if (width == 4) {
+ StoreLo8(dst, res);
+ } else if (width == 8) {
+ StoreUnaligned16(dst, res);
+ } else if (width == 16) {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ } else {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ const __m128i res_2 =
+ CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+ const __m128i res_3 =
+ CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+ }
+
+ dst += stride;
+ } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadHi8(LoadLo8(src), src + src_stride);
+ src += src_stride << 1;
+ sum = _mm_add_epi16(sum, samples);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples = _mm_unpackhi_epi64(samples, samples);
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // The left shift by 3 used elsewhere to increase precision is folded into
+ // the right shift: block_height_log2 + 2 (log2 of width 4) - 3.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadLo8(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadUnaligned16(src);
+ src += src_stride;
+ sum = _mm_add_epi16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // The left shift by 3 used elsewhere to increase precision is folded into
+ // the right shift: block_height_log2 + 3 (log2 of width 8) - 3.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadUnaligned16(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = zero;
+ __m128i inner_sum_lo, inner_sum_hi;
+ __m128i samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = LoadUnaligned16(src);
+ samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+ : LastRowResult(samples[0]);
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+ : LastRowResult(samples[2]);
+
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ do {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+ // The left shift by 3 used elsewhere to increase precision is folded into
+ // the right-shift amount: block_width_log2 + block_height_log2 - 3.
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ __m128i samples_ext = zero;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ samples[idx] = LoadUnaligned16(&src[x]);
+ samples[idx] = _mm_slli_epi16(samples[idx], 3);
+ samples_ext = samples[idx];
+ } else {
+ samples[idx] = LastRowResult(samples_ext);
+ }
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row0 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row1 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row3 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row5 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row7 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const __m128i samples_row10 = LoadUnaligned16(src);
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = LoadUnaligned16(src);
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const __m128i samples_row30 = LoadUnaligned16(src);
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = LoadUnaligned16(src);
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const __m128i samples_row50 = LoadUnaligned16(src);
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = LoadUnaligned16(src);
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const __m128i samples_row70 = LoadUnaligned16(src);
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+  // Duplicate the final row downward to fill the rows past the available luma.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src + 16)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? LoadUnaligned16(src + 24)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row10 = LoadUnaligned16(src_next);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? LoadUnaligned16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+      // There are 16 16-bit fill values per row, but only 4 lanes survive the
+      // widening to 32-bit, so the left shift by 2 (multiply by 4) accounts
+      // for all 16.
+ final_sum = _mm_add_epi32(
+ final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row, but only 4 lanes survive the
+      // widening to 32-bit, so the left shift by 2 (multiply by 4) accounts
+      // for all 16.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/x86/intrapred_cfl_sse4.h b/src/dsp/x86/intrapred_cfl_sse4.h
new file mode 100644
index 0000000..5d1a425
--- /dev/null
+++ b/src/dsp/x86/intrapred_cfl_sse4.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
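+// They are typically called once, e.g. from the library's one-time DspInit()
+// setup, before any decoding work begins.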
+void IntraPredCflInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
new file mode 100644
index 0000000..e642aee
--- /dev/null
+++ b/src/dsp/x86/intrapred_directional_sse4.cc
@@ -0,0 +1,1478 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the weighted average
+// val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift
+// reduces to val = top[top_base_x] << 5. Because |top_x| begins at |xstep|,
+// |top_base_x| is y + 1 for row y, so |top| is effectively offset by 1.
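+// For example, without upsampling row y uses top_x = (y + 1) * 64, giving
+// top_base_x == y + 1 and shift_val == 0; the memcpy calls below implement
+// that plain copy of top + y + 1 directly.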
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
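+  // For example, if top_base_x == 12 and max_base_x == 14, the packed indices
+  // are 13..20, so _mm_cmpgt_epi16 against max_base_x flags exactly the lanes
+  // whose actual positions (14..19) satisfy position >= max_base_x.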
+
+  // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
+  // is always greater than |height|, so clamping |xstep_units| to at least 1 is
+  // enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to |min_corner_only_y| need the full weighted computation; the
+  // remaining rows only use the corner value.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
+  // is always greater than |height|, so clamping |xstep_units| to at least 1 is
+  // enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
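+// For example, with shift == 12 each packed pair is (20, 12), so
+// _mm_maddubs_epi16 produces source[i] * 20 + source[i + 1] * 12 per lane,
+// which is then rounded and scaled down by 2^kDirectionalWeightBits.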
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
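+// For example, a relative index of -3 becomes shuffle index 12 after adding
+// kPositiveIndexOffset, and element 12 of a load starting at
+// |left_column_base - 15| is the same byte as left_column_base[-3].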
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // |left_column| is rebased 15 bytes lower and the sampler indices are raised
+  // by 15, so the shuffle indices are always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// share a value of (ystep >> 6) map to the same threshold, so it is enough to
+// look up by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
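+
+// For example, ystep values in [384, 447] give an index of 6 and a threshold
+// of 0, so the shuffle-based left-column path is skipped for those angles,
+// while an entry of 1024 places no practical limit on the height.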
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+    // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offsetting the source address by (y << upsample_shift)
+    // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be duplicated
+// with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride8 = stride << 3;
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from left_column may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
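+  // Illustration with arbitrary numbers: with xstep = 64 and height = 32, the
+  // column group starting at x = 8 gets max_top_only_y = 8 and
+  // min_left_only_y = 16, so rows 0-7 use only |top_row|, rows 8-15 blend top
+  // and left values, and rows 16-31 use only |left_column|.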
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Pick up from the last y-value, using the 10% slower but secure method for
+ // left prediction.
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which will go into the left_column offset.
+ // Following values need the full ystep as a relative offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+    // Round down to a multiple of 4 to match the 4-row loops below.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute. Rounded up to the nearest multiple of 4.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
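+  // The copies above place the 160 bytes starting 16 before each caller
+  // pointer at offset 128 of a 288-byte local buffer, so |top_ptr| and
+  // |left_ptr| may be indexed from -144 through +143 without leaving local
+  // storage. Wide loads may touch the uninitialized bytes below the copied
+  // region, but those lanes should not contribute to the final output.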
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because |top_x| starts at |xstep|, row y reads
+// from top_base_x = y + 1, so each row starts one pixel further into the top
+// row than the previous one. Hence |top| is offset by 1.
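+// For example, row 0 copies top[1..width], row 1 copies top[2..width + 1],
+// and so on, which is exactly the memcpy pattern below.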
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ return;
+ }
+ int y = height;
+ do {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+ dst += stride;
+
+ offset += 8;
+ y -= 8;
+ } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+ const __m128i& shifts,
+ const __m128i& top_indices,
+ const __m128i& final_top_val,
+ const __m128i& border_index) {
+ const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+ __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+ prod = _mm_hadd_epi16(prod, prod);
+ const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+}
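+
+// Note that the two weights in each pair sum to 32, so with 10-bit pixels the
+// products and their pairwise sum stay at or below 32 * 1023 = 32736, keeping
+// _mm_mullo_epi16 and _mm_hadd_epi16 free of overflow here.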
+
+// When width is 4, only one load operation is needed per iteration. We also
+// skip the extra loop precomputations used for larger widths, whose overhead
+// would outweigh their benefit here.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+ // only cmpgt is available.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
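+  // For example, with illustrative values xstep = 512 and height = 16 and no
+  // upsampling, max_base_x = 19 and xstep_units = 8, so min_corner_only_y = 2
+  // and rows 2 through 15 are filled with top[max_base_x] below.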
+
+ int y = 0;
+ int top_x = xstep;
+ const __m128i max_shift = _mm_set1_epi16(32);
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadUnaligned16(top + top_base_x);
+ const __m128i pred =
+ CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+ max_base_x_vect);
+ StoreLo8(dst, pred);
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+ const __m128i& top_vals_0, const __m128i& top_vals_1,
+ const __m128i& sampler, const __m128i& shifts,
+ const __m128i& top_indices = _mm_setzero_si128(),
+ const __m128i& final_top_val = _mm_setzero_si128(),
+ const __m128i& border_index = _mm_setzero_si128()) {
+ constexpr int scale_int_bits = 5;
+ const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+ const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+ const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+ const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+ const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+ const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+ if (check_border) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+ }
+ return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+ const uint16_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+ // to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to |top_base_x|, it is used to mask values
+ // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+ // which is not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ // Corner-only section of the row.
+ Memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+ void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+ const int width, const int height, const int xstep, const bool upsampled) {
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ auto* dest = static_cast<uint16_t*>(dest_ptr);
+ stride /= sizeof(uint16_t);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ // Each base pixel paired with its following pixel, for hadd purposes.
+ const __m128i adjacency_shuffler = _mm_set_epi16(
+ 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
+ // This is equivalent to not shuffling at all.
+ const __m128i identity_shuffler = _mm_set_epi16(
+ 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  // This represents a trade-off between code size and speed. When upsampled
+  // is true, no shuffle is necessary, but skipping it without in-loop
+  // branching would require 2 copies of the main function body.
+ const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
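+  // Without upsampling, the adjacency shuffle expands the 16-bit pixels into
+  // pairs (t0, t1), (t1, t2), (t2, t3), (t3, t4) so that multiplying by the
+  // interleaved (32 - shift, shift) weights and horizontally adding gives
+  // (32 - shift) * t[i] + shift * t[i + 1] per output pixel. With upsampling,
+  // the needed pairs are already adjacent in memory, so the identity shuffle
+  // suffices.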
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+ sampler);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled, sampler);
+ return;
+ }
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = height;
+ do {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (--y != 0);
+ return;
+ }
+
+ // General case. Blocks with width less than 32 do not benefit from x-wise
+ // loop splitting, but do benefit from using memset on appropriate rows.
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_directional_sse4.h b/src/dsp/x86/intrapred_directional_sse4.h
new file mode 100644
index 0000000..b352450
--- /dev/null
+++ b/src/dsp/x86/intrapred_directional_sse4.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc
new file mode 100644
index 0000000..022af8d
--- /dev/null
+++ b/src/dsp/x86/intrapred_filter_sse4.cc
@@ -0,0 +1,432 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform block,
+// meaning that the predicted pixels in each sub-block are used as inputs to
+// sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter applied
+// to the same array of top-left, top, and left values. If fn refers to the
+// output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
+
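+// In effect, each output is
+//   fn = RightShiftWithRounding(t0*p0 + t1*p1 + ... + t6*p6, 4),
+// where t0-t6 are that output's taps, clipped to the 8-bit range by the
+// saturating pack. The code below computes this with _mm_maddubs_epi16
+// followed by two horizontal adds per row.
+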
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
+
+// Apply all filter taps to the given 7 packed 8-bit values, keeping the 8th
+// at zero to preserve the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+ const __m128i& pixels, const __m128i& taps_0_1,
+ const __m128i& taps_2_3, const __m128i& taps_4_5,
+ const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+  // |output_half| contains 8 partial sums, two for each of f0-f3.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* unused half */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
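+
+// To follow the reduction above: each _mm_maddubs_epi16 leaves four 16-bit
+// pairwise sums per filter (two filters per taps vector), the first
+// _mm_hadd_epi16 merges filters 0-1 with 2-3 leaving two partial sums per
+// filter, and the second completes the 7-tap totals for one output row.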
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left buffer
+// for successive blocks. This implementation takes advantage of the fact
+// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
+// using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_ptr,
+ const uint8_t* const left_ptr, FilterIntraPredictor pred,
+ const int height) {
+ // Two filter kernels per vector.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // Two sets of the same input pixels to apply two filters at once.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows, in
+ // order to fit the assumption that |left| has the next TL at position 8.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15.
+    // The last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the 8 and 16 case at this point, we can assume that |left| has the
+ // next TL at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+  // Load the previously predicted row to seed the next 4x2 block.
+ pixels = Load4(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final byte of the first
+  // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+  // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+  // Now we handle rows that reference previous blocks rather than top_row.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+    // The top-left pixel cannot be read with a simple offset from |dst| in
+    // these leftmost blocks; it comes from |left_ptr| instead.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_filter_sse4.h b/src/dsp/x86/intrapred_filter_sse4.h
new file mode 100644
index 0000000..ce28f93
--- /dev/null
+++ b/src/dsp/x86/intrapred_filter_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
index e944ea3..de9f551 100644
--- a/src/dsp/x86/intrapred_smooth_sse4.cc
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
#if LIBGAV1_TARGETING_SSE4_1
@@ -22,12 +22,12 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -67,29 +67,6 @@ inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}
-template <int y_mask>
-inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
- const __m128i& scaled_bottom_left) {
- const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
- const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
- const __m128i scaled_bottom_left_y =
- _mm_shuffle_epi32(scaled_bottom_left, y_mask);
- return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
-}
-
-template <int y_mask>
-inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
- const __m128i& weights,
- const __m128i& scaled_bottom_left,
- const __m128i& round) {
- __m128i pred_sum =
- SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
- // Equivalent to RightShiftWithRounding(pred[x][y], 8).
- pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
- const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
- Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
-}
-
// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
// |pixels| is a segment of the top row or the whole top row, and |weights| is
// repeated.
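The SmoothVerticalSum4/WriteSmoothVerticalSum4 helpers deleted above implemented the per-pixel SMOOTH_V blend in SIMD, four columns at a time. As a reference for what that arithmetic does, here is a scalar sketch of one pixel, assuming the usual 0..256 weight scale for the smooth predictors (helper names are illustrative, not libgav1 API):

#include <cstdint>

// Round2 / RightShiftWithRounding: shift right by |bits| with rounding.
inline uint32_t RoundingShift(uint32_t value, int bits) {
  return (value + (1u << (bits - 1))) >> bits;
}

// SMOOTH_V blends the pixel directly above with the bottom-left reference,
// weighted by the row's smooth weight. The final shift matches the deleted
// code's "RightShiftWithRounding(pred[x][y], 8)" step.
inline uint8_t SmoothVerticalPixel(uint8_t top, uint8_t bottom_left,
                                   int weight_y) {
  const uint32_t sum = static_cast<uint32_t>(weight_y) * top +
                       static_cast<uint32_t>(256 - weight_y) * bottom_left;
  return static_cast<uint8_t>(RoundingShift(sum, 8));  // always fits in 8 bits
}

The SIMD version kept the weighted top term and the pre-scaled bottom-left term in separate vector registers (weighted_top_y and scaled_bottom_left_y above) so four columns could share one add-and-shift sequence.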
diff --git a/src/dsp/x86/intrapred_smooth_sse4.h b/src/dsp/x86/intrapred_smooth_sse4.h
new file mode 100644
index 0000000..9353371
--- /dev/null
+++ b/src/dsp/x86/intrapred_smooth_sse4.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
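The long run of per-size defines above exists because the dsp table holds one function pointer for every (transform size, smooth mode) combination, and each pointer can be claimed by a different optimization level. A reduced sketch of the table shape those keys index into; the enum and type names are illustrative stand-ins for the real declarations in src/dsp/dsp.h:

#include <cstddef>

enum ExampleTransformSize : int {
  kExampleTransformSize4x4,
  kExampleTransformSize4x8,
  // ... one enumerator per block size, up to 64x64 ...
  kExampleNumTransformSizes
};

enum ExampleSmoothMode : int {
  kExampleSmooth,
  kExampleSmoothVertical,
  kExampleSmoothHorizontal,
  kExampleNumSmoothModes
};

// Assumed intra predictor signature: destination plus the top-row and
// left-column reference pixels.
using ExampleIntraPredictorFunc = void (*)(void* dest, std::ptrdiff_t stride,
                                           const void* top_row,
                                           const void* left_column);

// IntraPredSmoothInit_SSE4_1() fills cells of a grid like this; each
// LIBGAV1_Dsp8bpp_TransformSize*_IntraPredictorSmooth* define records which
// CPU level owns the corresponding cell.
struct ExampleDspTable {
  ExampleIntraPredictorFunc
      smooth[kExampleNumTransformSizes][kExampleNumSmoothModes];
};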
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
index 9938dfe..063929d 100644
--- a/src/dsp/x86/intrapred_sse4.cc
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -23,13 +23,14 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -51,10 +52,6 @@ inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
}
-// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
-// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
-constexpr int kDuplicateFirstHalf = 0x44;
-
//------------------------------------------------------------------------------
// DcPredFuncs_SSE4_1
@@ -1408,1337 +1405,6 @@ void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
}
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
-// upsampling is ruled out. In addition, the bits masked by 0x3F for
-// |shift_val| are 0 for all multiples of 64, so the formula
-// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
-// val = top[top_base_x] << 5, meaning only the first pixel of each pair
-// contributes to the output. Because |top_x| starts at |xstep|, the first row
-// reads from top_base_x == 1, hence |top| is offset by 1.
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int width,
- const int height) {
- ptrdiff_t offset = 1;
- if (height == 4) {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- return;
- }
- int y = 0;
- do {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- dst += stride;
- memcpy(dst, top + offset + 4, width);
- dst += stride;
- memcpy(dst, top + offset + 5, width);
- dst += stride;
- memcpy(dst, top + offset + 6, width);
- dst += stride;
- memcpy(dst, top + offset + 7, width);
- dst += stride;
-
- offset += 8;
- y += 8;
- } while (y < height);
-}
-
-inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const int rounding_bits = 5;
- const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
- const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
- const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
- : _mm_set_epi64x(0, 0x0403030202010100);
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- int y = 0;
- int top_x = xstep;
-
- for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
- const int top_base_x = top_x >> scale_bits;
-
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i max_shift = _mm_set1_epi8(32);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-
- // Load 8 values because we will select the sampled values based on
- // |upsampled|.
- const __m128i values = LoadLo8(top + top_base_x);
- const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
- prod = RightShiftWithRounding_U16(prod, rounding_bits);
- // Replace pixels from invalid range with top-right corner.
- prod = _mm_blendv_epi8(prod, final_top_val, past_max);
- Store4(dst, _mm_packus_epi16(prod, prod));
- }
-
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dst, top[max_base_x], /* width */ 4);
- dst += stride;
- }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
- // All rows from |min_corner_only_y| down are filled with memset. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- const int max_no_corner_y = std::min(
- LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
- height);
- // No need to check for exceeding |max_base_x| in the first loop.
- int y = 0;
- int top_x = xstep;
- for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- const int min_corner_only_x =
- std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
- for (; x < min_corner_only_x;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- // Corner-only section of the row.
- memset(dest + x, top_row[max_base_x], width - x);
- }
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- if (xstep == 64) {
- DirectionalZone1_Step64(dest, stride, top_row, width, height);
- return;
- }
- if (width == 4) {
- DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
- return;
- }
- if (width >= 32) {
- DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
- upsampled);
- return;
- }
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
- // No need to check for exceeding |max_base_x| in the loops.
- if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- dest += stride;
- top_x += xstep;
- } while (++y < height);
- return;
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
-
- if (top_base_x >= max_base_x) {
- for (int i = y; i < height; ++i) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
- return;
- }
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- for (; x < width - 8;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(top_row + top_base_x);
- } else {
- const __m128i top_vals = LoadLo8(top_row + top_base_x);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- dest += stride;
- top_x += xstep;
- } while (++y < height);
-}
-
-void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- auto* dst = static_cast<uint8_t*>(dest);
- DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
- upsampled_top);
-}
-
-template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[4];
- for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadLo8(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadLo8(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- result_block[x] = _mm_packus_epi16(vals, vals);
- }
- const __m128i result = Transpose4x4_U8(result_block);
- // This is result_row0.
- Store4(dest, result);
- dest += stride;
- const int result_row1 = _mm_extract_epi32(result, 1);
- memcpy(dest, &result_row1, sizeof(result_row1));
- dest += stride;
- const int result_row2 = _mm_extract_epi32(result, 2);
- memcpy(dest, &result_row2, sizeof(result_row2));
- dest += stride;
- const int result_row3 = _mm_extract_epi32(result, 3);
- memcpy(dest, &result_row3, sizeof(result_row3));
-}
-
-template <bool upsampled, int height>
-inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler =
- _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[8];
- for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
- }
- Transpose8x8_U16(result_block, result_block);
- for (int y = 0; y < height; ++y) {
- StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
- dest += stride;
- }
-}
-
-// 7.11.2.4 (9) angle > 180
-void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled) {
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_shift = static_cast<int>(upsampled);
- if (width == 4 || height == 4) {
- const ptrdiff_t stride4 = stride << 2;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<true>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
- ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- }
- return;
- }
-
- const ptrdiff_t stride8 = stride << 3;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<true, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<false, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- }
-}
-
-//------------------------------------------------------------------------------
-// Directional Zone 2 Functions
-// 7.11.2.4 (8)
-
-// DirectionalBlend* selectively overwrites the values written by
-// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
-// row.
-template <int y_selector>
-inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds) {
- const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds,
- const __m128i& bounds_selector) {
- const __m128i max_dest_x_vect =
- _mm_shuffle_epi8(zone_bounds, bounds_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-constexpr int kDirectionalWeightBits = 5;
-// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
-// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
-// shift) and shift. Shift is guaranteed to be between 0 and 32.
-inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
- const __m128i& shifts,
- const __m128i& sampler) {
- const __m128i src_vals = LoadUnaligned16(source);
- __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
-}
-
-// Because the source values "move backwards" as the row index increases, the
-// indices derived from ystep are generally negative. This is accommodated by
-// making sure the relative indices are within [-15, 0] when the function is
-// called, and sliding them into the inclusive range [0, 15], relative to a
-// lower base address.
-constexpr int kPositiveIndexOffset = 15;
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
- const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
- // Left_column and sampler are both offset by 15 so the indices are always
- // positive.
- const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
- for (int y = 0; y < 4; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
-
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
- // can work as shuffle indices. Some values may be out of bounds, but their
- // pred results will be masked over by top prediction.
- sampler = _mm_add_epi8(sampler, positive_offset);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column + (y << upsample_shift), shifts, sampler);
- Store4(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_set1_epi8(1);
- const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
- for (int y = 0; y < 8; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
-
- // Offset the relative index because ystep is negative in Zone 2 and shuffle
- // indices must be nonnegative.
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- sampler = _mm_add_epi8(sampler, denegation);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-
- // The specification adds (y << 6) to left_y, which is subject to
- // upsampling, but this puts sampler indices out of the 0-15 range. It is
- // equivalent to offsetting the source address by (y << upsample_shift) instead.
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
- sampler);
- StoreLo8(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
-// upsampled_top), for each row. When there are 4 values, they can be duplicated
-// with a non-register shuffle mask.
-// |shifts| is one pair of weights that applies throughout a given row.
-template <bool upsampled_top>
-inline void DirectionalZone1Blend_4x4(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
- top_x -= xstep;
-
- int top_base_x = (top_x >> scale_bits_x);
- const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
- DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
- DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
- DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
- DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
-}
-
-template <bool upsampled_top, int height>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
-
- __m128i y_selector = _mm_set1_epi32(0x01000100);
- const __m128i index_increment = _mm_set1_epi32(0x02020202);
- for (int y = 0; y < height; ++y,
- y_selector = _mm_add_epi8(y_selector, index_increment),
- dest += stride) {
- top_x -= xstep;
- const int top_base_x = top_x >> scale_bits_x;
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
- DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
- }
-}
-
-// 7.11.2.4 (8) 90 < angle < 180
-// The strategy for this function is to know how many blocks can be processed
-// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
-// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
-// approach is used for pred values from |left_column| in sections that permit
-// it.
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
- // Accumulate xstep across 8 rows.
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
- // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
- const __m128i scaled_one = _mm_set1_epi16(-64);
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep8 = ystep << 3;
- const int left_base_increment8 = ystep8 >> 6;
- const int ystep_remainder8 = ystep8 & 0x3F;
- const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which is covered under the left_column
- // offset. Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
-
- const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
- int x = 0;
-
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
- for (int left_offset = -left_base_increment; x < min_top_only_x;
- x += 8,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
- // Watch left_y because it can still get big.
- left_y = _mm_add_epi16(left_y, increment_left8),
- left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
- }
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
-
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride4 = stride << 2;
- const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- const int xstep4 = xstep << 2;
- const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
- __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
- const __m128i scaled_one = _mm_set1_epi16(-64);
- // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep4 = ystep << 2;
- const int left_base_increment4 = ystep4 >> 6;
- // This is guaranteed to be less than 64, but accumulation may bring it past
- // 64 for higher x values.
- const int ystep_remainder4 = ystep4 & 0x3F;
- const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
- const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which will go into the left_column offset.
- // Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-
- int x = 0;
- // Loop over x for columns with a mixture of sources.
- for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
- left_y = _mm_add_epi16(left_y, increment_left4),
- left_offset -= left_base_increment4) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute. Rounded up to the nearest multiple of 4.
- const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
-
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- // Loop over y for mixed rows.
- for (; y < min_left_only_y;
- y += 4, dst_x += stride4,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
- top_x -= xstep4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) * (1 << upsample_left_shift)),
- left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_4x4<upsampled_top>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left-only rows, if any.
- for (; y < height; y += 4, dst_x += stride4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
- }
- }
- // Loop over top-only columns, if any.
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
-
-void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
- // Increasing the negative buffer for this function allows more rows to be
- // processed at a time without branching in an inner loop to check the base.
- uint8_t top_buffer[288];
- uint8_t left_buffer[288];
- memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
- memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
- const uint8_t* top_ptr = top_buffer + 144;
- const uint8_t* left_ptr = left_buffer + 144;
- if (width == 4 || height == 4) {
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
- return;
- }
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
-}
-
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_SSE4_1
-
-// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
-// at zero to preserve the sum.
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
- const __m128i& pixels, const __m128i& taps_0_1,
- const __m128i& taps_2_3, const __m128i& taps_4_5,
- const __m128i& taps_6_7) {
- const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
- const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
- // |output_half| contains 8 partial sums.
- __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
- __m128i output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row0 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst, output_row0);
- const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
- const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
- output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
- output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row1 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst + stride, output_row1);
-}
-
-// 4xH transform sizes are given special treatment because LoadLo8 goes out
-// of bounds and every block involves the left column. This implementation
-// loads TL from the top row for the first block, so it is not taken from the
-// left column.
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_ptr,
- const uint8_t* const left_ptr, FilterIntraPredictor pred,
- const int height) {
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
- __m128i top = Load4(top_ptr - 1);
- __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
- __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
- left = _mm_slli_si128(left, 5);
-
- // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
- // left[2], left[3], left[4], left[5], left[6], left[7]
- pixels = _mm_or_si128(left, pixels);
-
- // Duplicate first 8 bytes.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 1.
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
- // left[0], left[1], ...
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
- // byte is an unused value, which shall be multiplied by 0 when we apply the
- // filter.
- constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
-
- // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
- const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 2.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 3.
-
-  // Compute the middle 8 rows before using common code for the final 4 rows.
-  // Because the common code below this block assumes that the left vector has
-  // the next TL at position 8, this block must leave |left| in that state.
- if (height == 16) {
- // This shift allows us to use pixel_order2 twice after shifting by 2 later.
- left = _mm_slli_si128(left, 1);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
- // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
- // last byte is an unused value, as above. The top-left was shifted to
- // position nine to keep two empty spaces after the top pixels.
- constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
-
- // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
- // the end.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 4.
-
- // First 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
-    // Clear all but the final pixel of the first 8 left-column pixels.
- __m128i keep_top_left = _mm_srli_si128(left, 13);
- dest += stride; // Move to y = 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
- // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
- pixels = _mm_or_si128(left, pixels);
- left = LoadLo8(left_ptr + 8);
-
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 6.
-
- // Second 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // Position TL value so we can use pixel_order1.
- keep_top_left = _mm_slli_si128(keep_top_left, 6);
- dest += stride; // Move to y = 7.
- pixels = Load4(dest);
- left = _mm_slli_si128(left, 7);
- left = _mm_or_si128(left, keep_top_left);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 8.
-
- // Third 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 9.
-
- // Prepare final inputs.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2],
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 10.
-
- // Fourth 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 11.
- }
-
-  // In both the 8 and 16 cases, we assume that the left vector has the next TL
- // at position 8.
- if (height > 4) {
- // Erase prior left pixels by shifting TL to position 0.
- left = _mm_srli_si128(left, 8);
- left = _mm_slli_si128(left, 6);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 12 or 4.
-
- // First of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 13 or 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2],
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 14 or 6.
-
- // Last of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- }
-}
-
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- FilterIntraPredictor pred, const int width,
- const int height) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- if (width == 4) {
- Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
- return;
- }
-
- // There is one set of 7 taps for each of the 4x2 output pixels.
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
-
- // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
- // the end is an unused value, which shall be multiplied by 0 when we apply
- // the filter.
- constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
-
- // Takes the "left section" and puts it right after p0-p4.
- const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
-
- // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
- // byte is unused as above.
- constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
-
- // Shuffles the "top left" from the left section, to the front. Used when
- // grabbing data from left_column and not top_row.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
-
- // This first pass takes care of the cases where the top left pixel comes from
- // top_row.
- __m128i pixels = LoadLo8(top_ptr - 1);
- __m128i left = _mm_slli_si128(Load4(left_column), 8);
- pixels = _mm_or_si128(pixels, left);
-
- // Two sets of the same pixels to multiply with two sets of taps.
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
- left = _mm_srli_si128(left, 1);
-
-  // Load the pixels of the row just written at y = 1; they serve as the top
-  // row for the next 4x2 block.
- pixels = Load4(dst + stride);
-
-  // Because of the above shift, this OR 'invades' the final byte of the first
-  // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
-  // always a padded 0.
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- const ptrdiff_t stride2 = stride << 1;
- const ptrdiff_t stride4 = stride << 2;
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dst += 4;
- for (int x = 3; x < width - 4; x += 4) {
- pixels = Load4(top_ptr + x);
- pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
-
- // Now we handle heights that reference previous blocks rather than top_row.
- for (int y = 4; y < height; y += 4) {
- // Leftmost 4x4 block for this height.
- dst -= width;
- dst += stride4;
-
-    // The top-left pixel is not available at an offset from |dst| in these
-    // leftmost blocks, so it is loaded from |left_ptr| instead.
- pixels = Load4(dst - stride);
- left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
- left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
- left = _mm_srli_si128(left, 2);
- pixels = Load4(dst + stride);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
-
- dst += 4;
-
- // Remaining 4x4 blocks for this height.
- for (int x = 4; x < width; x += 4) {
- pixels = Load4(dst - stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
- }
-}
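The shuffle-control constants above (kCondenseLeftMask, kInsertTopLeftMask, and the masks in Filter4xH) pack one source-byte index per output byte, least significant byte first. A small decoding sketch, purely illustrative:

#include <cstdint>
#include <cstdio>

// Prints the byte order selected by a 64-bit _mm_shuffle_epi8 control word;
// the least-significant byte chooses output byte 0.
inline void PrintShuffleOrder(uint64_t mask) {
  for (int i = 0; i < 8; ++i) {
    printf("%d ", static_cast<int>((mask >> (8 * i)) & 0xFF));
  }
  printf("\n");
}

// PrintShuffleOrder(0x0F09080403020100) prints "0 1 2 3 4 8 9 15",
// matching the comment on kCondenseLeftMask.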
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -2746,21 +1412,6 @@ void Init8bpp() {
// These guards check if this version of the function was not superseded by
// a higher optimization level, such as AVX. The corresponding #define also
// prevents the C version from being added to the table.
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
- dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_SSE4_1;
-#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
DcDefs::_4x4::DcTop;
@@ -3524,7 +2175,7 @@ void IntraPredInit_SSE4_1() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h
index 7f4fcd7..1f6f30a 100644
--- a/src/dsp/x86/intrapred_sse4.h
+++ b/src/dsp/x86/intrapred_sse4.h
@@ -23,13 +23,9 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
+// Initializes Dsp::intra_predictors. See the defines below for specifics.
+// These functions are not thread-safe.
void IntraPredInit_SSE4_1();
-void IntraPredCflInit_SSE4_1();
-void IntraPredSmoothInit_SSE4_1();
} // namespace dsp
} // namespace libgav1
@@ -37,22 +33,6 @@ void IntraPredSmoothInit_SSE4_1();
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
#if LIBGAV1_TARGETING_SSE4_1
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
#endif
@@ -138,174 +118,6 @@ void IntraPredSmoothInit_SSE4_1();
LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
#endif
@@ -658,287 +470,6 @@ void IntraPredSmoothInit_SSE4_1();
LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
//------------------------------------------------------------------------------
// 10bpp
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
index 787d706..12c008f 100644
--- a/src/dsp/x86/inverse_transform_sse4.cc
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -94,8 +94,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
const __m128i ab = _mm_unpacklo_epi16(*b, *a);
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
@@ -121,8 +120,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
const int16_t sin128 = Sin128(angle);
const __m128i psin_pcos = _mm_set1_epi32(
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
@@ -229,7 +227,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
const __m128i v_src =
(width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1039,7 +1038,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src =
_mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1194,7 +1194,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
__m128i s[8];
const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1519,7 +1520,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
__m128i x[16];
const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1615,7 +1617,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1767,7 +1770,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1859,7 +1863,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
auto* dst = static_cast<int16_t*>(dest);
const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round0 =
@@ -2918,75 +2923,11 @@ void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
//------------------------------------------------------------------------------
-template <typename Residual, typename Pixel>
-void InitAll(Dsp* const dsp) {
- // Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
- Dct4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
- Dct4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
- Dct8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
- Dct8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
- Dct16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
- Dct16TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
- Dct32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
- Dct32TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
- Dct64TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
- Dct64TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
- Adst4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
- Adst4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
- Adst8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
- Adst8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
- Adst16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
- Adst16TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
- Identity4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
- Identity4TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
- Identity8TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
- Identity8TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
- Identity16TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
- Identity16TransformLoopColumn_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
- Identity32TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
- Identity32TransformLoopColumn_SSE4_1;
-
- // Maximum transform size for Wht is 4.
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
- Wht4TransformLoopRow_SSE4_1;
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
- Wht4TransformLoopColumn_SSE4_1;
-}
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- InitAll<int16_t, uint8_t>(dsp);
-#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // Maximum transform size for Dct is 64.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
Dct4TransformLoopRow_SSE4_1;
@@ -3017,6 +2958,8 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
Dct64TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Adst is 16.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
Adst4TransformLoopRow_SSE4_1;
@@ -3035,6 +2978,8 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
Adst16TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Identity transform is 32.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
Identity4TransformLoopRow_SSE4_1;
@@ -3059,13 +3004,14 @@ void Init8bpp() {
dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
Identity32TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Wht is 4.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
Wht4TransformLoopRow_SSE4_1;
dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
Wht4TransformLoopColumn_SSE4_1;
#endif
-#endif
}
} // namespace
@@ -3075,7 +3021,7 @@ void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
index d67b450..b9da2d5 100644
--- a/src/dsp/x86/loop_filter_sse4.cc
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -350,7 +350,7 @@ void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -454,7 +454,7 @@ void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -595,7 +595,7 @@ void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -697,7 +697,7 @@ void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -838,7 +838,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i p6 = Load4(dst - 7 * stride);
const __m128i p5 = Load4(dst - 6 * stride);
const __m128i p4 = Load4(dst - 5 * stride);
@@ -864,8 +864,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -1050,7 +1049,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i v_isflatouter4_mask =
IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
const __m128i v_flat4_mask =
@@ -1066,8 +1065,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -1458,7 +1456,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -1572,7 +1570,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -1711,7 +1709,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -1821,7 +1819,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -1957,7 +1955,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i p6 = LoadLo8(dst - 7 * stride);
const __m128i p5 = LoadLo8(dst - 6 * stride);
const __m128i p4 = LoadLo8(dst - 5 * stride);
@@ -1984,8 +1982,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest,
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -2133,7 +2130,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i v_isflatouter4_mask =
IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
@@ -2150,8 +2147,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8,
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -2245,7 +2241,7 @@ void LoopFilterInit_SSE4_1() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc
index 702bdea..b38f322 100644
--- a/src/dsp/x86/loop_restoration_10bit_avx2.cc
+++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -28,7 +28,6 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_avx2.h"
-#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
@@ -472,12 +471,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
}
-void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -502,39 +501,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, &coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
&coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
&coefficients_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, &coefficients_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
&coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
&coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, &coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
&coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
&coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -566,12 +568,2575 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
}
}
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD loads overread by (pixels per SIMD register) - (width % 8) - 2 *
+// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. A SIMD
+// register holds 16 bytes (8 pixels) for SSE4.1 and 32 bytes (16 pixels) for
+// AVX2; the constants below are expressed in bytes.
+constexpr int kOverreadInBytesPass1_128 = 4;
+constexpr int kOverreadInBytesPass2_128 = 8;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
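A rough sanity check on the arithmetic described above, assuming 10-bit pixels stored as uint16_t (two bytes each) and a width that is a multiple of 8:

// A 128-bit register holds 8 pixels (16 bytes). Pass 1 pads 3 pixels on each
// side (12 bytes) and Pass 2 pads 2 pixels on each side (8 bytes), leaving 4
// and 8 bytes of potential overread respectively.
static_assert(16 - 2 * 3 * 2 == kOverreadInBytesPass1_128, "");
static_assert(16 - 2 * 2 * 2 == kOverreadInBytesPass2_128, "");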
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17],
+// ..., ma[23].
+// There is an 8 pixel gap between the first half and the second half.
+constexpr int kMaStoreOffset = 8;
+
+inline void StoreAligned32_ma(uint16_t* src, const __m256i v) {
+ StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0));
+ StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1));
+}
+
+inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) {
+ // The next 4 lines are much faster than:
+ // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20));
+ // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31));
+ StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0));
+ StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0));
+ StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1));
+ StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1));
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrU16(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1)));
+ return _mm256_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+inline void Square(const __m256i src, __m256i dst[2]) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
+ dst[0] = _mm256_madd_epi16(s0, s0);
+ dst[1] = _mm256_madd_epi16(s1, s1);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_32(const __m256i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline __m256i Sum5_32(const __m256i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline __m256i Sum3Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ return Sum3_16(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline __m256i Sum5Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ return Sum5_16(s);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3, __m256i* const row5) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ const __m256i sum04 = _mm256_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm256_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0);
+ SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1);
+}
+
+inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = _mm256_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return _mm256_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass1_128 - sizeof(*src) * width;
+ const ptrdiff_t overread_in_bytes_256 =
+ kOverreadInBytesPass1_256 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+ __m256i sq[8];
+ s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128 + 0);
+ Square(s0[1], sq_128 + 2);
+ SumHorizontal16(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int overread_in_bytes_128, overread_in_bytes_256;
+ if (size == 3) {
+ overread_in_bytes_128 = kOverreadInBytesPass2_128;
+ overread_in_bytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ overread_in_bytes_128 = kOverreadInBytesPass1_128;
+ overread_in_bytes_256 = kOverreadInBytesPass1_256;
+ }
+ overread_in_bytes_128 -= sizeof(*src) * width;
+ overread_in_bytes_256 -= sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s_128[2], ss, sq_128[4], sqs[2];
+ __m256i sq[8];
+ s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+ s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s_128[0], sq_128 + 0);
+ Square(s_128[1], sq_128 + 2);
+ if (size == 3) {
+ ss = Sum3Horizontal16(s_128);
+ Sum3Horizontal32(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal16(s_128);
+ Sum5Horizontal32(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row[2], row_sq[4];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 4, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 4, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency, so use shifts and additions instead.
+  // Some compilers could do this for us, but we make it explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
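+  // (A sketch of the arithmetic: sum_sq + (sum_sq << 3) == 9 * sum_sq, and
+  // adding (sum_sq << 4) gives (9 + 16) * sum_sq == 25 * sum_sq.)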
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i b = VrshrU16(sum, 2);
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency, so use shifts and additions
+  // instead. Some compilers could do this for us, but we make it explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i b = VrshrU16(sum, 2);
+ const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
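+  // (Derivation sketch: ((1 << 12) + 12) / 25 == 4108 / 25 == 164. The
+  // quarter constant 41 keeps ma * 41 within signed 16 bits for the maddubs
+  // multiply below, and the reduced shift kSgrProjReciprocalBits - 2
+  // compensates for dividing the constant by 4.)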
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
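+  // (Derivation sketch: ((1 << 12) + 4) / 9 == 4100 / 9 == 455. Unlike
+  // CalculateB5(), 455 does not fit in 8 bits, so a plain 32-bit multiply is
+  // used instead of the maddubs trick.)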
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded; the compiler keeps them
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
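+// Each 32-byte row above duplicates 16 consecutive entries into both 128-bit
+// lanes because _mm256_shuffle_epi8() only shuffles within each 128-bit lane.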
+
+// Set the shuffle control mask for indices outside the range [0, 15] to
+// (1xxxxxxx)b so that the shuffle result is 0. The most significant bit 1
+// comes either from the comparison instruction or from the sign bit of the
+// index.
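+// For example, index 5 leaves the mask byte as 5 and selects table[5], while
+// index 20 (> 15) sets the sign bit of the mask byte and the shuffle result
+// is 0.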
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
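+
+// Note: the cmpgt instructions return 0 or -1 per byte, so AdjustValue()
+// effectively decrements |value| wherever |index| exceeds |threshold|. The
+// chained calls in CalculateIntermediate() below walk the value down from 5
+// towards 0.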
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // The shuffles below set elements whose indices are larger than 47 to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the lookup values change
+  // only rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]); // 0 2 1 3
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // The shuffles below set elements whose indices are larger than 47 to 0.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, the lookup values change
+  // only rarely as the index increases, so we use comparison and arithmetic
+  // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ __m256i sums[2];
+ sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20);
+ sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31);
+ if (n == 9) {
+ CalculateB3(sums[0], maq0, b0);
+ CalculateB3(sums[1], maq1, b1);
+ } else {
+ CalculateB5(sums[0], maq0, b0);
+ CalculateB5(sums[1], maq1, b1);
+ }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[3], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]);
+ StoreAligned64(b444 + x, sum_b444);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+// Don't combine the following 2 functions, which would be slower.
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi,
+ __m256i* const sum_ma444_lo,
+ __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], __m256i sum_b444_lo[2],
+ __m256i sum_b444_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_mat343[2], sum_mat444[2];
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20);
+ *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31);
+ StoreAligned32(ma444 + x + 0, *sum_ma444_lo);
+ StoreAligned32(ma444 + x + 16, *sum_ma444_hi);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444);
+}
+
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444[2], sum_b444[2], sum_mat343[2];
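+// Interleaving each 16-bit value with zero lets the madd instructions compute
+// the 32-bit squares directly (x * x + 0 * 0).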
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ StoreAligned64_ma(ma444 + x, sum_ma444);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444);
+}
+
+inline void PermuteB(const __m256i t[4], __m256i b[7]) {
+ // Input:
+ // 0 1 2 3 // b[0]
+ // 4 5 6 7 // b[1]
+ // 8 9 10 11 24 25 26 27 // t[0]
+ // 12 13 14 15 28 29 30 31 // t[1]
+ // 16 17 18 19 32 33 34 35 // t[2]
+ // 20 21 22 23 36 37 38 39 // t[3]
+
+ // Output:
+ // 0 1 2 3 8 9 10 11 // b[0]
+ // 4 5 6 7 12 13 14 15 // b[1]
+ // 8 9 10 11 16 17 18 19 // b[2]
+ // 16 17 18 19 24 25 26 27 // b[3]
+ // 20 21 22 23 28 29 30 31 // b[4]
+ // 24 25 26 27 32 33 34 35 // b[5]
+ // 20 21 22 23 36 37 38 39 // b[6]
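+  // (A reminder of the _mm256_permute2x128_si256() semantics: imm bits [1:0]
+  // select the low 128-bit lane of the result from {a.lo, a.hi, b.lo, b.hi}
+  // and bits [5:4] select the high lane, so 0x21 == {a.hi, b.lo},
+  // 0x20 == {a.lo, b.lo}, 0x30 == {a.lo, b.hi} and 0x31 == {a.hi, b.hi}.)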
+ b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21);
+ b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21);
+ b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20);
+ b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30);
+ b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30);
+ b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31);
+ b[6] = t[3];
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3],
+ __m256i b[3]) {
+ __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32);
+ s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0);
+ s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8],
+ __m256i ma[3], __m256i b[7]) {
+ __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s[0], sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s3[2] = Sum3Horizontal16(src, over_read_in_bytes);
+ s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[1], sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3],
+ __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3],
+ __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) {
+ __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2],
+ index_3[2][2], sum_5[2], index_5[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2);
+ PermuteB(t, b3[0]);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2);
+ PermuteB(t, b3[1]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+ PermuteB(t, b3);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[10];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 3, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src) * width;
+ __m128i s[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(
+ src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width),
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64_ma(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 3, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10],
+ b5_128[10];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_128[0], b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64_ma(ma343[0] + x, ma);
+ Sum343(b3[0], b);
+ Sum343(b3[0] + 3, b + 2);
+ StoreAligned64(b343[0] + x, b);
+ StoreAligned64(b343[0] + x + 16, b + 2);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+ b343[1], b444);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(b5, b);
+ StoreAligned64(b565, b);
+ Sum565(b5 + 3, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
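+// One row of 565-weighted intermediates carries a total weight of 16, so the
+// two-row sum used here (and the 343/444/343 triple used by pass 2) totals 32
+// and is normalized with shift 5, while a single row uses shift 4.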
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
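+// |w0| is packed into the low 16 bits and |w2| into the high 16 bits of each
+// 32-bit lane, so a single _mm256_madd_epi16() computes
+// w0 * filter[0] + w2 * filter[1].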
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
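+// Clamps the filtered result to the 10-bit range [0, 1023] and stores 16
+// pixels.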
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+ const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256());
+ const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, val1);
+}
+
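+// Box filter pass 1 (5x5 window): filters two source rows per call and writes
+// two output rows.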
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[1]);
+ StoreAligned32(ma565[1] + x + 16, ma[3]);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ StoreAligned64(b565[1] + x, b[1]);
+ StoreAligned64(b565[1] + x + 16, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0);
+ const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ ClipAndStore(dst + stride + x + 16, d11);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
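+// Variant of BoxFilterPass1() that filters only the final row, used when
+// |height| is odd.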
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma0[2], sq_128[8], b0[6];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0],
+ b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0[0], ma0[0]);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ LoadAligned64(b565 + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma565 + x + 16);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ LoadAligned64(b565 + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
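+// Box filter pass 2 (3x3 window): filters one source row per call.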
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src0) * width;
+ __m128i s0[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(
+ src0 + x + 8,
+ kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr_lo = LoadUnaligned32(src + x + 0);
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
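+// Combined filter: runs pass 1 and pass 2 on two rows and blends the results
+// with the weights |w0| and |w2|.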
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3],
+ ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1],
+ &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2],
+ ma444[1], b343[2], b444[1]);
+ Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2],
+ ma343[3], ma444[2], b343[3], b444[2]);
+
+ ma[0][2] = Sum565Lo(ma5x);
+ ma[0][3] = Sum565Hi(ma5x);
+ ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20);
+ ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[0][1]);
+ StoreAligned32(ma565[1] + x + 16, ma[0][3]);
+ Sum565(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x);
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+    // Keeping the following 4 redundant lines is faster: there are not enough
+    // registers available, so these values would otherwise have to be spilled
+    // to memory and reloaded, which is even slower.
+ ma[1][2] = LoadAligned32(ma343[2] + x); // Redundant line 1.
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ ma[2][1] = LoadAligned32(ma444[1] + x); // Redundant line 2.
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ClipAndStore(dst + x, d00);
+ const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+ ClipAndStore(dst + stride + x, d10x);
+
+ Sum565(b5 + 3, bt[0][1]);
+ StoreAligned64(b565[1] + x + 16, bt[0][1]);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, bt[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ mat[1][2] = LoadAligned32(ma343[2] + x + 16); // Redundant line 3.
+ LoadAligned64(b343[0] + x + 16, bt[1][0]);
+ LoadAligned64(b444[0] + x + 16, bt[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ mat[2][1] = LoadAligned32(ma444[1] + x + 16); // Redundant line 4.
+ LoadAligned64(b343[1] + x + 16, bt[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 16, d01);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 16, d11);
+
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
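+// Variant of BoxFilter() that filters only the final row, used when |height|
+// is odd.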
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2];
+ __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, b3_128, b5_128);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2];
+ BoxFilterPreProcessLastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3,
+ b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[2] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ mat[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 3, bt[1]);
+ ma[3] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 3, bt[2]);
+
+ const __m256i sr_lo = LoadUnaligned32(src + x);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20);
+ mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31);
+ LoadAligned64(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, bt[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31);
+ LoadAligned64(b343 + x + 16, bt[0]);
+ LoadAligned64(b444 + x + 16, bt[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[5];
+ b3[1] = b3[6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
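+// Driver used when both passes are enabled: box-sums the top border, seeds the
+// intermediates from the first rows, then filters two rows per iteration,
+// handling the bottom border and an odd final row separately.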
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
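+// Driver used when only pass 1 (the 5x5 filter) is enabled, i.e. the pass 2
+// radius is 0.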
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
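+// Driver used when only pass 2 (the 3x3 filter) is enabled, i.e. the pass 1
+// radius is 0.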
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will
+// not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
dsp->loop_restorations[0] = WienerFilter_AVX2;
#endif
+#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
}
} // namespace
@@ -581,7 +3146,7 @@ void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc
index 0598435..96380e3 100644
--- a/src/dsp/x86/loop_restoration_10bit_sse4.cc
+++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -428,13 +428,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
}
-void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border,
- const ptrdiff_t stride, const int width,
- const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -458,39 +457,42 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
const __m128i coefficients_horizontal =
LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, coefficients_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -522,6 +524,1978 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
}
}
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code, which could
+// make the whole decoder 15% slower.
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
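+// Rounding right shifts: 1 << (src1 - 1) is added before shifting. The names
+// mirror the NEON vrshr intrinsics.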
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
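+// Squares each 16-bit element of |src|, producing 32-bit results in |dst|.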
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
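+// Produces three windows over the concatenation of src[0] (low) and src[1]
+// (high), byte-shifted by |offset|, |offset| + 1 and |offset| + 2.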
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
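+// Computes the horizontal 3-tap and 5-tap sums in one pass:
+// *row3 = s[1] + s[2] + s[3] and *row5 = s[0] + s[1] + s[2] + s[3] + s[4].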
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ SumHorizontal16(src + 0, row3_0, row5_0);
+ SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
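+// Weighted sum of three adjacent values with weights (3, 4, 3):
+// 3 * (a + b + c) + b.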
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
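+// Weighted sum of three adjacent values with weights (5, 6, 5):
+// 5 * (a + b + c) + b.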
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
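+// Computes the horizontal 3-wide and 5-wide sums, and the corresponding sums
+// of squares, for two rows; the box filters add these row sums vertically
+// later.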
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(s + 0);
+ row[1] = Sum3Horizontal16(s + 1);
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 2, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(s + 0);
+ row[1] = Sum5Horizontal16(s + 1);
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 2, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency, so shifts and additions are used
+ // instead. Some compilers could do this for us, but we make it explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
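+ // With shifts: 9 * x == (x << 3) + x and 25 * x == (x << 4) + (x << 3) + x.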
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
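+// For 10-bit input, |sum| is rounded down by 2 bits and |sum_sq| by 4 bits
+// first, keeping the products below in roughly the ranges an 8-bit input
+// would produce, at the cost of a little precision.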
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i b = VrshrU16(sum, 2);
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
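+ // Each 16-bit lane of |ma| keeps its value in the low byte, so
+ // _mm_maddubs_epi16() with |one_over_n_quarter| (41, which fits in 8 bits)
+ // computes ma * 41; the shift below is reduced by 2 to undo the /4.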
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The values are not actually stored to and reloaded from memory; the
+ // compiler keeps them in a 64-bit general-purpose register, which is faster
+ // than using _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+// Set the shuffle control mask of indices outside the range [0, 15] to
+// (1xxxxxxx)b so that the shuffle result is 0. The most significant bit 1
+// comes either from the comparison instruction or from the sign bit of the
+// index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
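+// Subtracts 1 from |value| in every lane where |index| exceeds |threshold|:
+// the signed comparison yields -1 (0xff) in those lanes and 0 elsewhere, and
+// adding that mask applies the decrement. |index| arrives biased by -128 and
+// |threshold| is biased the same way here, so the signed compare preserves
+// the unsigned ordering of the original indices.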
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+ // Elements whose indices are larger than 47 change value only rarely as the
+ // index increases, so their values are computed with comparison and
+ // arithmetic operations rather than table lookups.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+ *ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5.
+ *ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4.
+ *ma = AdjustValue(*ma, idx, 101); // 101 is the last index whose value is 3.
+ *ma = AdjustValue(*ma, idx, 169); // 169 is the last index whose value is 2.
+ *ma = AdjustValue(*ma, idx, 254); // 254 is the last index whose value is 1.
+
+ // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
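+// Same as above, but packs the 16 newly computed |ma| bytes so that the first
+// 8 land in the high half of ma[0], next to the values already in its low
+// half, and the last 8 land in the low half of ma[1].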
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[4]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Calling CalculateIntermediate() to replace the slow
+// LookupIntermediate() when calculating 16 intermediate data points has been
+// tried. However, the compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
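+// Computes and stores the 444 and 343 weighted sums of |b3|: given the 3-tap
+// sum s = b0 + b1 + b2, b444 = 4 * s and b343 = b444 - s + b1
+// (= 3 * b0 + 4 * b1 + 3 * b2).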
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+ StoreAligned32U32(b444 + x, sum_b444);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ s5[0][3] = Sum5Horizontal16(s[0] + 1);
+ s5[1][3] = Sum5Horizontal16(s[0] + 2);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ s5[0][4] = Sum5Horizontal16(s[1] + 1);
+ s5[1][4] = Sum5Horizontal16(s[1] + 2);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5Horizontal32(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[2], sq + 4);
+ s5[0][3] = Sum5Horizontal16(s + 1);
+ s5[1][3] = Sum5Horizontal16(s + 2);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[3], sq + 6);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ s3[2] = Sum3Horizontal16(s + 1);
+ s3[3] = Sum3Horizontal16(s + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src + x + 16,
+ overread_in_bytes + sizeof(*src) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src + x + 24,
+ overread_in_bytes + sizeof(*src) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 2, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343(b3[0] + 0, b + 0);
+ Sum343(b3[0] + 2, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565(b5 + 0, b + 0);
+ Sum565(b5 + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
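+// Pass 1 filters two rows at a time: |ma| and |b| hold the 565 sums of the
+// previous and the current row pair, which are added before filtering. This
+// is why the shift here is 5, versus the shift of 4 used when a single 565
+// sum is applied.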
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
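+ // Each 32-bit lane of |w0_w2| holds w0 in its low 16 bits and w2 in its
+ // high 16 bits, so after interleaving the two filters _mm_madd_epi16()
+ // produces filter[0] * w0 + filter[1] * w2 per lane.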
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
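+// Clamps the filtered result to the 10-bit pixel range [0, 1023] before
+// storing.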
+inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
+ const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
+ const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
+ StoreAligned16(dst, val1);
+}
+
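+// Filters a pair of rows with the pass 1 (565) weights: the row from |src|
+// combines the previous and the current 565 sums, while the row at
+// |src + stride| uses only the current sums (shift of 4).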
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2], p[2];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ const __m128i sr0_lo = LoadAligned16(src + x + 0);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565(bs + 2, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565(bs + 2, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0_lo = LoadAligned16(src + x);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x);
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565(b5 + 2, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], b3, b5);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 2, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 2, b[2]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[4];
+ b3[1] = b3[5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
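+ // Each iteration filters two rows. The sum buffers are rotated before the
+ // call and the ma*/b* buffers after it, so the results carry over to the
+ // next row pair.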
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
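For orientation, the loop above covers the frame two rows at a time, with the last one or two rows handled by the branches after it. Below is a hedged scalar count of the calls issued by BoxFilterProcessPass1 (assuming, as the dst += 2 * stride advances suggest, that each BoxFilterPass1 call emits two output rows and BoxFilterPass1LastRow emits one):

// Illustrative only: how many filter calls BoxFilterProcessPass1 issues for a
// given |height|, mirroring the control flow above.
struct Pass1Plan {
  int two_row_calls;   // BoxFilterPass1: main loop plus the tail call
  bool last_row_call;  // BoxFilterPass1LastRow for odd heights
};

inline Pass1Plan PlanPass1(int height) {
  Pass1Plan plan;
  plan.two_row_calls = (height >> 1) - 1;  // main loop iterations
  if (plan.two_row_calls < 0) plan.two_row_calls = 0;
  if (height != 1) ++plan.two_row_calls;  // the tail call after the loop
  plan.last_row_call = (height & 1) != 0;
  return plan;
}

// e.g. height == 7: 2 loop iterations + 1 tail call cover rows 0..5, and the
// last-row call covers row 6; 3 * 2 + 1 == 7 rows in total.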
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
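Pass 2 advances one row at a time, unlike pass 1. A hedged scalar count of the BoxFilterPass2 calls issued above (assuming each call emits one output row, as the per-iteration dst += stride suggests):

// Illustrative only: the main loop reads source rows, the do/while tail reads
// |bottom_border| rows, and together they cover every output row once.
inline int Pass2Calls(int height) {
  const int main_loop_calls = (height > 2) ? height - 2 : 0;
  const int tail_calls = (height < 2) ? height : 2;  // std::min(height, 2)
  return main_loop_calls + tail_calls;  // == height for all height >= 1
}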
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
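SelfGuidedFilter_SSE4_1 above only has to decide which of the two box-filter passes to run; the separate top_border_stride and bottom_border_stride parameters threaded through this patch presumably allow the border rows to live in buffers whose strides differ from the frame's |stride|. A hedged restatement of the dispatch:

// Sketch only. radius_pass_0 is kSgrProjParams[index][0] (2 or 0) and
// radius_pass_1 is kSgrProjParams[index][2] (1 or 0); the two radii are never
// both zero, per the assertion above.
enum class SgrPasses { kPass1Only, kPass2Only, kBoth };

inline SgrPasses ChoosePasses(int radius_pass_0, int radius_pass_1) {
  if (radius_pass_1 == 0) return SgrPasses::kPass1Only;  // 5x5 pass only.
  if (radius_pass_0 == 0) return SgrPasses::kPass2Only;  // 3x3 pass only.
  return SgrPasses::kBoth;  // both passes, blended with w0 and w2.
}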
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -531,6 +2505,11 @@ void Init10bpp() {
#else
static_cast<void>(WienerFilter_SSE4_1);
#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
}
} // namespace
@@ -540,7 +2519,7 @@ void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
index 7ae7c90..351a324 100644
--- a/src/dsp/x86/loop_restoration_avx2.cc
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -28,7 +28,6 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_avx2.h"
-#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
@@ -116,7 +115,8 @@ inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
- filter[3] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000));
+ filter[3] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000)));
for (int y = height; y != 0; --y) {
__m256i s = LoadUnaligned32(src);
__m256i ss[4];
@@ -144,7 +144,8 @@ inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
__m256i filter[3];
filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
- filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001));
+ filter[2] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001)));
for (int y = height; y != 0; --y) {
__m256i s = LoadUnaligned32(src);
__m256i ss[4];
@@ -171,7 +172,8 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
int16_t** const wiener_buffer) {
__m256i filter[2];
filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
- filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002));
+ filter[1] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002)));
for (int y = height; y != 0; --y) {
__m256i s = LoadUnaligned32(src);
__m256i ss[4];
@@ -480,12 +482,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
}
-void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border, const ptrdiff_t stride,
- const int width, const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -515,39 +517,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info,
c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, coefficients_horizontal,
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, coefficients_horizontal,
- &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -765,17 +770,6 @@ inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
return _mm256_add_epi32(src0, s1);
}
-// Using VgetLane16() can save a sign extension instruction.
-template <int n>
-inline int VgetLane16(__m256i src) {
- return _mm256_extract_epi16(src, n);
-}
-
-template <int n>
-inline int VgetLane8(__m256i src) {
- return _mm256_extract_epi8(src, n);
-}
-
inline __m256i VmullNLo8(const __m256i src0, const int src1) {
const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
@@ -1253,9 +1247,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
do {
const __m128i s0 =
LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
- __m128i sq_128[2];
+ __m128i sq_128[2], s3, s5, sq3[2], sq5[2];
__m256i sq[3];
- __m128i s3, s5, sq3[2], sq5[2];
sq_128[0] = SquareLo8(s0);
sq_128[1] = SquareHi8(s0);
SumHorizontalLo(s0, &s3, &s5);
@@ -1432,11 +1425,43 @@ inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
return _mm256_packus_epi32(z0, z1);
}
-template <int n>
-inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
- static_assert(n == 9 || n == 25, "");
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
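Splitting CalculateB into CalculateB3 and CalculateB5 lets the radius-2 case replace the 32-bit multiply by 164 with a 16-bit maddubs multiply by 41 and a shift that is two bits smaller. A scalar sketch of why the two forms give identical results (assuming kSgrProjReciprocalBits == 12, which the ">> 12" bound comments elsewhere in this diff imply, and |ma| <= 255 as noted above):

#include <cstdint>

// b_old = (ma * sum * 164 + (1 << 11)) >> 12    (rounding shift by 12)
// b_new = ((ma * 41) * sum + (1 << 9)) >> 10    (rounding shift by 10)
// Since 164 == 41 << 2, the numerator and rounding bias of b_old are exactly
// four times those of b_new, so the two expressions are equal for all inputs.
inline uint32_t B5Reference(uint32_t ma, uint32_t sum) {
  return (ma * sum * 164 + (1u << 11)) >> 12;
}

inline uint32_t B5Quartered(uint32_t ma, uint32_t sum) {
  const uint32_t m = ma * 41;  // ma <= 255, so m <= 10455 fits in 16 bits,
                               // which is what makes the maddubs form legal.
  return (m * sum + (1u << 9)) >> 10;
}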
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
const __m128i m0 = VmullLo16(ma, sum);
const __m128i m1 = VmullHi16(ma, sum);
const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
@@ -1446,11 +1471,10 @@ inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
return _mm_packus_epi32(b_lo, b_hi);
}
-template <int n>
-inline __m256i CalculateB(const __m256i sum, const __m256i ma) {
- static_assert(n == 9 || n == 25, "");
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+ // one_over_n == 455.
constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
const __m256i m0 = VmullLo16(ma, sum);
const __m256i m1 = VmullHi16(ma, sum);
const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
@@ -1525,7 +1549,7 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index,
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
- *b = CalculateB<n>(sum, maq);
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
}
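For reference, the bounds quoted in the comment above follow from the largest possible 8-bit box sums: a 5x5 window sums to at most 255 * 25 == 6375 and a 3x3 window to at most 255 * 9 == 2295, and |ma| is at most 255. A quick compile-time check (illustrative, not part of the patch):

static_assert((255u * 6375u * 164u >> 12) == 65088u, "radius 2 fits in u16");
static_assert((255u * 2295u * 455u >> 12) == 65009u, "radius 1 fits in u16");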
// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
@@ -1539,7 +1563,7 @@ alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
 // to get value 0 as the shuffle result. The most significant bit 1 comes
-// either from the comparision instruction, or from the sign bit of the index.
+// either from the comparison instruction, or from the sign bit of the index.
inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
__m256i mask;
mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
@@ -1558,15 +1582,15 @@ template <int n>
inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
__m256i ma[3], __m256i b[2]) {
static_assert(n == 9 || n == 25, "");
- // Use table lookup to read elements which indices are less than 48.
+ // Use table lookup to read elements whose indices are less than 48.
const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
__m256i idx, mas;
- // Clip idx to 127 to apply signed comparision instructions.
+ // Clip idx to 127 to apply signed comparison instructions.
idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
- // All elements which indices are less than 48 are set to 0.
+ // All elements whose indices are less than 48 are set to 0.
// Get shuffle results for indices in range [0, 15].
mas = ShuffleIndex(c0, idx);
// Get shuffle results for indices in range [16, 31].
@@ -1581,12 +1605,12 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
const __m256i res2 = ShuffleIndex(c2, idx);
mas = _mm256_or_si256(mas, res2);
- // For elements which indices are larger than 47, since they seldom change
+ // For elements whose indices are larger than 47, since they seldom change
// values with the increase of the index, we use comparison and arithmetic
// operations to calculate their values.
- // Add -128 to apply signed comparision instructions.
+ // Add -128 to apply signed comparison instructions.
idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
- // Elements which indices are larger than 47 (with value 0) are set to 5.
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
   mas = AdjustValue(mas, idx, 55);  // 55 is the last index whose value is 5.
   mas = AdjustValue(mas, idx, 72);  // 72 is the last index whose value is 4.
@@ -1611,8 +1635,13 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
- b[0] = CalculateB<n>(sum[0], maq0);
- b[1] = CalculateB<n>(sum[1], maq1);
+ if (n == 9) {
+ b[0] = CalculateB3(sum[0], maq0);
+ b[1] = CalculateB3(sum[1], maq1);
+ } else {
+ b[0] = CalculateB5(sum[0], maq0);
+ b[1] = CalculateB5(sum[1], maq1);
+ }
}
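The block above computes |ma| in two halves: a pshufb lookup into the repeated 48-entry table for small indices (after clipping to 127 so signed byte comparisons work), and compare-and-decrement adjustments for the rarely changing larger indices. A scalar sketch of the intended mapping (hedged: AdjustValue's body is outside this hunk, but the comments indicate it subtracts one past each threshold, and only the first two thresholds appear here):

#include <cstdint>

// Illustrative scalar model of the vector lookup above.
inline int MaFromIndex(int index, const uint8_t lookup[48]) {
  if (index < 48) return lookup[index];  // direct table lookup
  int ma = 5;                            // indices 48..55 map to 5
  if (index > 55) --ma;                  // 55 is the last index whose value is 5
  if (index > 72) --ma;                  // 72 is the last index whose value is 4
  // ...further thresholds continue in the real code.
  return ma;
}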
inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
@@ -1903,8 +1932,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
__m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
- __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2],
- sum_3[2][2], index_3[2][2], sum_5[2], index_5[2];
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2],
+ sum_5[2], index_5[2];
sq[0][1] = SquareLo8(s0);
sq[0][2] = SquareHi8(s0);
sq[1][1] = SquareLo8(s1);
@@ -1938,22 +1967,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
LoadAligned64x3U32(square_sum5, x, sq5);
CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
- SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
- SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]);
- StoreAligned64(square_sum3[2] + x + 16, sq3t[2]);
- StoreAligned64(square_sum5[3] + x + 16, sq5t[3]);
- StoreAligned64(square_sum3[3] + x + 16, sq3t[3]);
- StoreAligned64(square_sum5[4] + x + 16, sq5t[4]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
- LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
- CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]);
- CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1],
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
&index_3[1][1]);
CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
- LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
- CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
@@ -1988,8 +2017,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
__m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
__m256i b5[5]) {
const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
- __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2],
- sum_3[2], index_3[2], sum_5[2], index_5[2];
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2];
sq[1] = SquareLo8(s0);
sq[2] = SquareHi8(s0);
sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
@@ -2006,17 +2035,17 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
sq5[4][1] = sq5[3][1];
CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
- SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
- LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
- CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
s5[1][4] = s5[1][3];
- LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
- sq5t[4][0] = sq5t[3][0];
- sq5t[4][1] = sq5t[3][1];
- CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
@@ -2071,9 +2100,9 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
uint16_t* const sum3[3], uint32_t* const square_sum3[3],
const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
uint32_t* b444) {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
__m128i ma0, sq_128[2], b0;
__m256i mas[3], sq[3], bs[3];
- const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
sq_128[0] = SquareLo8(s);
BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
sq[0] = SetrM128i(sq_128[0], sq_128[1]);
@@ -2115,9 +2144,9 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- const ptrdiff_t sum_width, uint16_t* const ma343[4],
- uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
- uint32_t* const b444[2], uint32_t* b565) {
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
__m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
__m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
@@ -2151,9 +2180,8 @@ inline void BoxSumFilterPreProcess(
Sum565W(b5, b);
StoreAligned64(b565, b);
Prepare3_8(ma3[1], ma3x);
- Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1],
- b444[0]);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
Prepare3_8(ma5, ma5x);
ma[0] = Sum565Lo(ma5x);
ma[1] = Sum565Hi(ma5x);
@@ -2199,8 +2227,9 @@ inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
}
-inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
- __m256i b[2][2]) {
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
__m256i b_sum[2];
b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
@@ -2208,8 +2237,9 @@ inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
}
-inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3],
- __m256i b[3][2]) {
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
const __m256i ma_sum = Sum3_16(ma);
__m256i b_sum[2];
Sum3_32(b, b_sum);
@@ -2267,13 +2297,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
int x = 0;
do {
- __m256i ma[3], ma3[3], b[2][2][2];
+ __m256i ma[3], ma5[3], b[2][2][2];
BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
x + 8, scale, sum5, square_sum5, sq, mas, bs);
- Prepare3_8(mas, ma3);
- ma[1] = Sum565Lo(ma3);
- ma[2] = Sum565Hi(ma3);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
StoreAligned64(ma565[1] + x, ma + 1);
Sum565W(bs + 0, b[0][1]);
Sum565W(bs + 1, b[1][1]);
@@ -2511,9 +2541,9 @@ inline void BoxFilterLastRow(
const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
const __m128i s0 =
LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
__m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
@@ -2542,13 +2572,13 @@ inline void BoxFilterLastRow(
Sum343W(b3, b[2]);
const __m256i sr = LoadUnaligned32(src + x);
const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
- ma[0] = LoadAligned32(ma565[0] + x);
- LoadAligned64(b565[0] + x, b[0]);
+ ma[0] = LoadAligned32(ma565 + x);
+ LoadAligned64(b565 + x, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
- ma[0] = LoadAligned32(ma343[0] + x);
- ma[1] = LoadAligned32(ma444[0] + x);
- LoadAligned64(b343[0] + x, b[0]);
- LoadAligned64(b444[0] + x, b[1]);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
@@ -2557,13 +2587,13 @@ inline void BoxFilterLastRow(
mat[2] = Sum343Hi(ma3x);
Sum343W(b3 + 1, b[2]);
const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
- mat[0] = LoadAligned32(ma565[0] + x + 16);
- LoadAligned64(b565[0] + x + 16, b[0]);
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
- mat[0] = LoadAligned32(ma343[0] + x + 16);
- mat[1] = LoadAligned32(ma444[0] + x + 16);
- LoadAligned64(b343[0] + x + 16, b[0]);
- LoadAligned64(b444[0] + x + 16, b[1]);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ LoadAligned64(b343 + x + 16, b[0]);
+ LoadAligned64(b444 + x + 16, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
@@ -2578,8 +2608,9 @@ inline void BoxFilterLastRow(
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 32);
const auto sum_width = temp_stride + 8;
@@ -2619,14 +2650,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1],
- square_sum3[0], square_sum5[1]);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, sum_width, ma343, ma444, ma565[0], b343,
- b444, b565[0]);
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
sum5[0] = sgr_buffer->sum5 + kSumOffset;
square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
@@ -2656,7 +2687,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2680,19 +2711,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
- w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
- ma565, b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 32);
const auto sum_width = temp_stride + 8;
const auto sum_stride = temp_stride + 32;
@@ -2712,8 +2745,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, width, sum_stride, temp_stride, sum5[1],
- square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -2739,7 +2772,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2757,18 +2790,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
- w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 32);
const auto sum_width = temp_stride + 8;
@@ -2794,8 +2829,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0],
- square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
sum_width, ma343[0], nullptr, b343[0],
nullptr);
@@ -2806,7 +2841,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
ma343[1], ma444[0], b343[1], b444[0]);
@@ -2833,7 +2868,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
square_sum3, ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -2841,13 +2876,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
} while (--y != 0);
}
-// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in
-// the end of each row. It is safe to overwrite the output as it will not be
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_AVX2(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
@@ -2861,14 +2897,17 @@ void SelfGuidedFilter_AVX2(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
@@ -2891,7 +2930,7 @@ void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_AVX2
+#else // !LIBGAV1_TARGETING_AVX2
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h
index d80227c..2c3534a 100644
--- a/src/dsp/x86/loop_restoration_avx2.h
+++ b/src/dsp/x86/loop_restoration_avx2.h
@@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_AVX2();
#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
#endif // LIBGAV1_TARGETING_AVX2
#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 24f5ad2..273bcc8 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -481,13 +481,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer,
}
}
-void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
- const void* const source, const void* const top_border,
- const void* const bottom_border,
- const ptrdiff_t stride, const int width,
- const int height,
- RestorationBuffer* const restoration_buffer,
- void* const dest) {
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
@@ -516,45 +515,48 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info,
const __m128i coefficients_horizontal =
_mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
- wiener_stride, height_extra, filter_horizontal[0],
- coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[0], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
filter_horizontal[0], coefficients_horizontal,
&wiener_buffer_horizontal);
- } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
- wiener_stride, height_extra, filter_horizontal[1],
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[0],
coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[1], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
filter_horizontal[1], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
- wiener_stride, height_extra, filter_horizontal[2],
- coefficients_horizontal, &wiener_buffer_horizontal);
- WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
filter_horizontal[2], coefficients_horizontal,
&wiener_buffer_horizontal);
- WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
filter_horizontal[2], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
- wiener_stride, height_extra,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
WienerHorizontalTap1(src, stride, wiener_stride, height,
&wiener_buffer_horizontal);
- WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
- &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -1160,11 +1162,26 @@ inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
return _mm_packus_epi32(z0, z1);
}
-template <int n>
-inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
- static_assert(n == 9 || n == 25, "");
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
const __m128i m0 = VmullLo16(ma, sum);
const __m128i m1 = VmullHi16(ma, sum);
const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
@@ -1227,12 +1244,12 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index,
} else {
maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
}
- *b = CalculateB<n>(sum, maq);
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
}
// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
 // to get value 0 as the shuffle result. The most significant bit 1 comes
-// either from the comparision instruction, or from the sign bit of the index.
+// either from the comparison instruction, or from the sign bit of the index.
inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
__m128i mask;
mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
@@ -1250,15 +1267,15 @@ inline __m128i AdjustValue(const __m128i value, const __m128i index,
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
__m128i* const ma, __m128i* const b0,
__m128i* const b1) {
- // Use table lookup to read elements which indices are less than 48.
+ // Use table lookup to read elements whose indices are less than 48.
const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
const __m128i indices = _mm_packus_epi16(index[0], index[1]);
__m128i idx;
- // Clip idx to 127 to apply signed comparision instructions.
+ // Clip idx to 127 to apply signed comparison instructions.
idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
- // All elements which indices are less than 48 are set to 0.
+ // All elements whose indices are less than 48 are set to 0.
// Get shuffle results for indices in range [0, 15].
*ma = ShuffleIndex(c0, idx);
// Get shuffle results for indices in range [16, 31].
@@ -1273,12 +1290,12 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
const __m128i res2 = ShuffleIndex(c2, idx);
*ma = _mm_or_si128(*ma, res2);
- // For elements which indices are larger than 47, since they seldom change
+ // For elements whose indices are larger than 47, since they seldom change
// values with the increase of the index, we use comparison and arithmetic
// operations to calculate their values.
- // Add -128 to apply signed comparision instructions.
+ // Add -128 to apply signed comparison instructions.
idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
- // Elements which indices are larger than 47 (with value 0) are set to 5.
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
*ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
@@ -1298,9 +1315,9 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
- *b0 = CalculateB<9>(sum[0], maq0);
+ *b0 = CalculateB3(sum[0], maq0);
const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
- *b1 = CalculateB<9>(sum[1], maq1);
+ *b1 = CalculateB3(sum[1], maq1);
}
inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
@@ -1776,9 +1793,9 @@ inline void BoxSumFilterPreProcess(
const uint8_t* const src0, const uint8_t* const src1, const int width,
const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- const ptrdiff_t sum_width, uint16_t* const ma343[4],
- uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
- uint32_t* const b444[2], uint32_t* b565) {
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
__m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
@@ -1808,9 +1825,8 @@ inline void BoxSumFilterPreProcess(
Sum565W(b5 + 1, b + 2);
StoreAligned64U32(b565, b);
Prepare3_8<0>(ma3[1], ma3x);
- Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
- b444[0]);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
Prepare3_8<0>(ma5, ma5x);
ma[0] = Sum565Lo(ma5x);
ma[1] = Sum565Hi(ma5x);
@@ -1854,8 +1870,9 @@ inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
}
-inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
- __m128i b[2][2]) {
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
__m128i b_sum[2];
b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
@@ -1863,8 +1880,9 @@ inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
}
-inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
- __m128i b[3][2]) {
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
const __m128i ma_sum = Sum3_16(ma);
__m128i b_sum[2];
Sum3_32(b, b_sum);
@@ -1916,15 +1934,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
int x = 0;
do {
- __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+ __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
x + 16 + kOverreadInBytesPass1 - width);
s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
x + 16 + kOverreadInBytesPass1 - width);
BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
bs);
- Prepare3_8<0>(mas, ma3);
- ma[1] = Sum565Lo(ma3);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
StoreAligned16(ma565[1] + x, ma[1]);
Sum565W(bs, b[1]);
StoreAligned32U32(b565[1] + x, b[1]);
@@ -1939,7 +1957,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
- ma[1] = Sum565Hi(ma3);
+ ma[1] = Sum565Hi(ma5);
StoreAligned16(ma565[1] + x + 8, ma[1]);
Sum565W(bs + 1, b[1]);
StoreAligned32U32(b565[1] + x + 8, b[1]);
@@ -2158,9 +2176,9 @@ inline void BoxFilterLastRow(
const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[3],
- uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst) {
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
__m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
sq[0] = SquareLo8(s[0]);
@@ -2183,13 +2201,13 @@ inline void BoxFilterLastRow(
Sum343W(b3, b[2]);
const __m128i sr = LoadAligned16(src + x);
const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
- ma[0] = LoadAligned16(ma565[0] + x);
- LoadAligned32U32(b565[0] + x, b[0]);
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
- ma[0] = LoadAligned16(ma343[0] + x);
- ma[1] = LoadAligned16(ma444[0] + x);
- LoadAligned32U32(b343[0] + x, b[0]);
- LoadAligned32U32(b444[0] + x, b[1]);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
@@ -2198,13 +2216,13 @@ inline void BoxFilterLastRow(
ma[2] = Sum343Hi(ma3x);
Sum343W(b3 + 1, b[2]);
const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
- ma[0] = LoadAligned16(ma565[0] + x + 8);
- LoadAligned32U32(b565[0] + x + 8, b[0]);
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
- ma[0] = LoadAligned16(ma343[0] + x + 8);
- ma[1] = LoadAligned16(ma444[0] + x + 8);
- LoadAligned32U32(b343[0] + x + 8, b[0]);
- LoadAligned32U32(b444[0] + x + 8, b[1]);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
@@ -2220,8 +2238,9 @@ inline void BoxFilterLastRow(
LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const RestorationUnitInfo& restoration_info, const uint8_t* src,
- const uint8_t* const top_border, const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
@@ -2261,14 +2280,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
- square_sum3[0], square_sum5[1]);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
- square_sum5, sum_width, ma343, ma444, ma565[0], b343,
- b444, b565[0]);
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
@@ -2298,7 +2317,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2322,19 +2341,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
- BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
- w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
- ma565, b343, b444, b565, dst);
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
const auto sum_stride = temp_stride + 16;
@@ -2354,8 +2375,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
- square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
@@ -2381,7 +2402,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
const uint8_t* sr[2];
if ((height & 1) == 0) {
sr[0] = bottom_border;
- sr[1] = bottom_border + stride;
+ sr[1] = bottom_border + bottom_border_stride;
} else {
sr[0] = src + 2 * stride;
sr[1] = bottom_border;
@@ -2399,18 +2420,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
}
- BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
- w0, sum5, square_sum5, ma565[0], b565[0], dst);
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
+ const uint8_t* src, const ptrdiff_t stride,
const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
const uint8_t* bottom_border,
- const ptrdiff_t stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst) {
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 16);
const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
@@ -2436,8 +2459,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
- square_sum3[0]);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
sum_width, ma343[0], nullptr, b343[0],
nullptr);
@@ -2448,7 +2471,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
s = src + stride;
} else {
s = bottom_border;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
}
BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
ma343[1], ma444[0], b343[1], b444[0]);
@@ -2475,7 +2498,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
square_sum3, ma343, ma444, b343, b444, dst);
src += stride;
dst += stride;
- bottom_border += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -2483,13 +2506,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
} while (--y != 0);
}
-// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in
-// the end of each row. It is safe to overwrite the output as it will not be
+// If |width| is not a multiple of 16, up to 15 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
void SelfGuidedFilter_SSE4_1(
const RestorationUnitInfo& restoration_info, const void* const source,
- const void* const top_border, const void* const bottom_border,
- const ptrdiff_t stride, const int width, const int height,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
@@ -2503,14 +2527,17 @@ void SelfGuidedFilter_SSE4_1(
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
- stride, width, height, sgr_buffer, dst);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
- width, height, sgr_buffer, dst);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
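
The three branches above select between the two self-guided passes from the radii encoded in kSgrProjParams. A scalar sketch of the dispatch, assuming the first (elided) branch tests radius_pass_1 == 0 as in the portable implementation; the Run* helpers are hypothetical stand-ins for BoxFilterProcessPass1, BoxFilterProcessPass2 and BoxFilterProcess:

#include <cassert>

inline void RunPass1() {}       // stand-in for BoxFilterProcessPass1 (5x5)
inline void RunPass2() {}       // stand-in for BoxFilterProcessPass2 (3x3)
inline void RunBothPasses() {}  // stand-in for BoxFilterProcess (both passes)

void SelfGuidedDispatchSketch(const int radius_pass_0,
                              const int radius_pass_1) {
  assert(radius_pass_0 != 0 || radius_pass_1 != 0);
  if (radius_pass_1 == 0) {
    RunPass1();       // only the 5x5 pass contributes
  } else if (radius_pass_0 == 0) {
    RunPass2();       // only the 3x3 pass contributes
  } else {
    RunBothPasses();  // both passes run and are mixed with w0/w2
  }
}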
@@ -2538,7 +2565,7 @@ void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
index 65b2b11..00df3af 100644
--- a/src/dsp/x86/loop_restoration_sse4.h
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_SSE4_1();
#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index d8036be..2e836af 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -430,12 +430,515 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
+inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
+ const __m128i zero) {
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, zero);
+}
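
The avg-with-zero idiom above is an unsigned rounding right shift. A scalar sketch of one 16-bit lane (the helper name is illustrative, not from the library):

#include <cstdint>

// Shifting by (bits - 1) leaves the rounding bit as the LSB; averaging with
// zero adds 1 and shifts once more, so the result equals
// (v + (1 << (bits - 1))) >> bits.
uint16_t RoundingRightShiftSketch(uint16_t v, int bits) {
  const uint16_t partial = static_cast<uint16_t>(v >> (bits - 1));
  return static_cast<uint16_t>((partial + 1) >> 1);  // _mm_avg_epu16(partial, 0)
}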
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+ const __m128i shift) {
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
+ const __m128i zero) {
+ if (subsampling_x == 1) {
+ if (subsampling_y == 0) {
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+ }
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 =
+ LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
+ const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
+ mask + (mask_stride << 1) + mask_stride);
+ const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
+ const __m128i zero) {
+ if (subsampling_x == 1) {
+ if (subsampling_y == 0) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+ }
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
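
Per output value, GetMask4x2/GetMask8 collapse the mask plane according to the chroma subsampling: 4:2:2 averages horizontal pairs and 4:2:0 averages 2x2 blocks, both with rounding. A scalar sketch of one value (my reading of the intrinsics; the helper name is illustrative):

#include <cstddef>
#include <cstdint>

uint8_t SubsampledMaskSketch(const uint8_t* mask, ptrdiff_t mask_stride, int x,
                             int subsampling_x, int subsampling_y) {
  if (subsampling_x == 0) return mask[x];  // 4:4:4: use the value directly.
  if (subsampling_y == 0) {
    // 4:2:2: average the horizontal pair with rounding.
    return static_cast<uint8_t>((mask[2 * x] + mask[2 * x + 1] + 1) >> 1);
  }
  // 4:2:0: average the 2x2 block with rounding.
  const int sum = mask[2 * x] + mask[2 * x + 1] + mask[mask_stride + 2 * x] +
                  mask[mask_stride + 2 * x + 1];
  return static_cast<uint8_t>((sum + 2) >> 2);
}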
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+ const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max,
+ const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+ const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+ // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+ // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth10) - 1));
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dst_stride, result);
+}
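
The per-pixel math vectorized here follows the reference formula quoted in the comments: blend, remove the compound offset, round by 4 bits and clip to 10 bits. A scalar sketch, with the compound offset passed in because its value is defined elsewhere in the library:

#include <algorithm>
#include <cstdint>

uint16_t MaskBlend10bppPixelSketch(uint16_t pred_0, uint16_t pred_1,
                                   uint8_t mask_value, int compound_offset) {
  int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  res -= compound_offset;       // kCompoundOffset in the real code
  res = (res + (1 << 3)) >> 4;  // RightShiftWithRounding(res, 4)
  return static_cast<uint16_t>(
      std::min(std::max(res, 0), (1 << 10) - 1));  // clip to kMax10bppSample
}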
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride, zero);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ const __m128i compound_pred_lo_0 =
+ _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 =
+ _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 =
+ _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 =
+ _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result =
+ _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreUnaligned16(dst + x, result);
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* prediction_0, const uint16_t* prediction_1,
+ const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+ const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+ const __m128i pred_val_1 =
+ LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+ StoreLo8(dst, res);
+ StoreHi8(dst + dst_stride, res);
+}
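
The inter-intra path is simpler than the compound path above: both inputs appear to already be in the 10-bit pixel range (the result is packed without an explicit clip), so the blend is a weighted sum followed by a rounding shift of 6. A scalar sketch:

#include <cstdint>

uint16_t InterIntraBlend10bppPixelSketch(uint16_t pred_0, uint16_t pred_1,
                                         uint8_t mask_value) {
  // madd of the (64 - m, m) and (pred_0, pred_1) pairs, then round by 6 bits.
  const int weighted = (64 - mask_value) * pred_0 + mask_value * pred_1;
  return static_cast<uint16_t>((weighted + 32) >> 6);
}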
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ int y = height;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+ const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride, zero);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+ dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+ dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+ dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
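
At decode time these entries are fetched through the dsp table; a hedged usage sketch (GetDspTable is assumed to be the accessor from src/dsp/dsp.h, and the row index follows the layout used above: 0 = 4:4:4, 1 = 4:2:2, 2 = 4:2:0, with the column selecting inter-intra):

#include "src/dsp/dsp.h"

auto GetMaskBlendFuncSketch(int subsampling_x, int subsampling_y,
                            bool is_inter_intra) {
  const libgav1::dsp::Dsp* const dsp = libgav1::dsp::GetDspTable(10);
  return dsp->mask_blend[subsampling_x + subsampling_y][is_inter_intra ? 1 : 0];
}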
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h
index 52b0b5c..4a95f0c 100644
--- a/src/dsp/x86/mask_blend_sse4.h
+++ b/src/dsp/x86/mask_blend_sse4.h
@@ -55,6 +55,30 @@ void MaskBlendInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
index c506941..e3f2cce 100644
--- a/src/dsp/x86/motion_field_projection_sse4.cc
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -139,9 +139,9 @@ inline void Store(const __m128i position, const __m128i reference_offset,
const ptrdiff_t offset =
static_cast<int16_t>(_mm_extract_epi16(position, idx));
if ((idx & 3) == 0) {
- dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
} else {
- dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
}
dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
}
@@ -386,7 +386,7 @@ void MotionFieldProjectionInit_SSE4_1() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
index e9cdd4c..7f5f035 100644
--- a/src/dsp/x86/motion_vector_search_sse4.cc
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -251,7 +251,7 @@ void MotionVectorSearchInit_SSE4_1() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
index 3a1d1fd..c34a7f7 100644
--- a/src/dsp/x86/obmc_sse4.cc
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -31,6 +31,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
#include "src/dsp/obmc.inc"
@@ -311,13 +312,295 @@ void Init8bpp() {
}
} // namespace
+} // namespace low_bitdepth
-void ObmcInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kRoundBitsObmcBlend = 6;
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+ const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+ const __m128i obmc_pred_val =
+ Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+ const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i result = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result, result);
+ Store4(pred, packed_result);
+ Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
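
A scalar sketch of the horizontal OBMC blend that this and the following width-specialized routines vectorize. The mask row for a given blend width starts at offset width - 2 in the mask table, as in the code; the table pointer is passed in rather than naming kObmcMask directly:

#include <cstddef>
#include <cstdint>

void OverlapBlendFromLeftSketch(uint16_t* pred, ptrdiff_t pred_stride,
                                int width, int height, const uint16_t* obmc,
                                ptrdiff_t obmc_stride,
                                const uint8_t* obmc_mask_table) {
  const uint8_t* mask = obmc_mask_table + width - 2;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int m = mask[x];  // weight for the existing prediction
      pred[x] = static_cast<uint16_t>(
          (m * pred[x] + (64 - m) * obmc[x] + 32) >> 6);  // rounding >> 6
    }
    pred += pred_stride;
    obmc += obmc_stride;
  }
}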
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+ const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint16_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadLo8(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction,
+ const ptrdiff_t pred_stride,
+ const int height,
+ const uint16_t* const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height =
+ height - (height >> 2); // Remaining rows have mask value 64, as in the 8-bit code.
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+ Store4(pred, packed_result);
+ Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8));
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y += 2;
+ } while (y < compute_height);
+}
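
The vertical blend uses the same weighted sum, but the weight varies per row and only the top height - (height >> 2) rows are touched; the remaining mask values are 64, which leaves the prediction unchanged. A scalar sketch:

#include <cstddef>
#include <cstdint>

void OverlapBlendFromTopSketch(uint16_t* pred, ptrdiff_t pred_stride,
                               int width, int height, const uint16_t* obmc,
                               ptrdiff_t obmc_stride,
                               const uint8_t* obmc_mask_table) {
  const uint8_t* mask = obmc_mask_table + height - 2;
  const int compute_height = height - (height >> 2);
  for (int y = 0; y < compute_height; ++y) {
    const int m = mask[y];  // one weight per row
    for (int x = 0; x < width; ++x) {
      pred[x] = static_cast<uint16_t>(
          (m * pred[x] + (64 - m) * obmc[x] + 32) >> 6);
    }
    pred += pred_stride;
    obmc += obmc_stride;
  }
}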
+
+inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction,
+ const ptrdiff_t pred_stride,
+ const int height,
+ const uint16_t* const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y += 2;
+ } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+ if (width == 2) {
+ OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ const uint8_t* mask = kObmcMask + height - 2;
+ pred = static_cast<uint16_t*>(prediction);
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ int y = 0;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+ x += 8;
+ } while (x < width);
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (++y < compute_height);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
index bd8b416..448d2cf 100644
--- a/src/dsp/x86/obmc_sse4.h
+++ b/src/dsp/x86/obmc_sse4.h
@@ -38,6 +38,12 @@ void ObmcInit_SSE4_1();
#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
index b2bdfd2..85d05bc 100644
--- a/src/dsp/x86/super_res_sse4.cc
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -91,10 +91,10 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width,
}
void SuperRes_SSE4_1(const void* const coefficients, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest) {
+ void* const dest, const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -104,16 +104,30 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source,
ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
kSuperResHorizontalBorder, kSuperResHorizontalBorder);
int subpixel_x = initial_subpixel_x;
- // The below code calculates up to 15 extra upscaled
- // pixels which will over-read up to 15 downscaled pixels in the end of each
- // row. kSuperResHorizontalBorder accounts for this.
+ // The code below calculates up to 15 extra upscaled pixels, which will
+ // over-read up to 15 downscaled pixels at the end of each row.
+ // kSuperResHorizontalPadding protects this behavior from segmentation
+ // faults and threading issues.
int x = RightShiftWithCeiling(upscaled_width, 4);
do {
__m128i weighted_src[8];
for (int i = 0; i < 8; ++i, filter += 16) {
- __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]);
+ // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+ // It's fine to write uninitialized bytes outside the frame, but the
+ // inside-frame pixels are incorrectly labeled uninitialized if
+ // uninitialized values go through the hadd intrinsics.
+ // |src| is offset 4 pixels to the left, and there are 4 extended border
+ // pixels, so a difference of 0 from |downscaled_width| indicates 8 good
+ // bytes. A difference of 1 indicates 7 good bytes.
+ const int msan_bytes_lo =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ __m128i s =
+ LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
subpixel_x += step;
- s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]);
+ const int msan_bytes_hi =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+ msan_bytes_hi);
subpixel_x += step;
const __m128i f = LoadAligned16(filter);
weighted_src[i] = _mm_maddubs_epi16(s, f);
@@ -135,26 +149,165 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source,
StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
dst_ptr += 16;
} while (--x != 0);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
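
For orientation, a scalar sketch of the horizontal upscaling this routine performs 16 pixels at a time: each output pixel picks a source position and an 8-tap filter phase from the accumulated subpixel step. The shift constants mirror the names used here, but their values are assumptions rather than copies from the library:

#include <algorithm>
#include <cstdint>

constexpr int kTapsSketch = 8;        // kSuperResFilterTaps
constexpr int kScaleBitsSketch = 14;  // kSuperResScaleBits (assumed)
constexpr int kExtraBitsSketch = 8;   // kSuperResExtraBits (assumed)
constexpr int kFilterBitsSketch = 7;  // kFilterBits: coefficients sum to 128

void SuperResRowSketch(const uint8_t* src, int upscaled_width,
                       int initial_subpixel_x, int step,
                       const int16_t filters[][kTapsSketch], uint8_t* dst) {
  int subpixel_x = initial_subpixel_x;
  for (int x = 0; x < upscaled_width; ++x, subpixel_x += step) {
    const int src_x = subpixel_x >> kScaleBitsSketch;
    const int phase =
        (subpixel_x & ((1 << kScaleBitsSketch) - 1)) >> kExtraBitsSketch;
    int sum = 0;
    for (int k = 0; k < kTapsSketch; ++k) {
      sum += src[src_x - kTapsSketch / 2 + k] * filters[phase][k];
    }
    const int rounded =
        (sum + (1 << (kFilterBitsSketch - 1))) >> kFilterBitsSketch;
    dst[x] = static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
  }
}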
void Init8bpp() {
Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
dsp->super_res = SuperRes_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
}
} // namespace
} // namespace low_bitdepth
-void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+alignas(16) const int16_t
+ kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, -1, 128, 2, -1, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0}, {0, 1, -4, 127, 6, -3, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0}, {0, 2, -7, 125, 11, -4, 1, 0},
+ {-1, 2, -8, 125, 13, -5, 2, 0}, {-1, 3, -9, 124, 15, -6, 2, 0},
+ {-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1},
+ {-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1},
+ {-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1},
+ {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1},
+ {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1},
+ {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1},
+ {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1},
+ {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1},
+ {-1, 6, -20, 97, 58, -17, 6, -1}, {-1, 6, -20, 95, 61, -18, 6, -1},
+ {-2, 7, -20, 93, 64, -18, 6, -2}, {-2, 7, -20, 91, 66, -19, 6, -1},
+ {-2, 7, -20, 88, 69, -19, 6, -1}, {-2, 7, -20, 86, 71, -19, 6, -1},
+ {-2, 7, -20, 84, 74, -20, 7, -2}, {-2, 7, -20, 81, 76, -20, 7, -1},
+ {-2, 7, -20, 79, 79, -20, 7, -2}, {-1, 7, -20, 76, 81, -20, 7, -2},
+ {-2, 7, -20, 74, 84, -20, 7, -2}, {-1, 6, -19, 71, 86, -20, 7, -2},
+ {-1, 6, -19, 69, 88, -20, 7, -2}, {-1, 6, -19, 66, 91, -20, 7, -2},
+ {-2, 6, -18, 64, 93, -20, 7, -2}, {-1, 6, -18, 61, 95, -20, 6, -1},
+ {-1, 6, -17, 58, 97, -20, 6, -1}, {-1, 6, -17, 56, 99, -20, 6, -1},
+ {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1},
+ {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1},
+ {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1},
+ {-1, 4, -12, 38, 112, -17, 5, -1}, {-1, 4, -12, 35, 114, -16, 5, -1},
+ {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
+ {-1, 3, -9, 28, 118, -14, 4, -1}, {-1, 3, -9, 25, 120, -13, 4, -1},
+ {-1, 3, -8, 22, 121, -12, 4, -1}, {-1, 3, -7, 20, 122, -11, 3, -1},
+ {-1, 2, -6, 18, 123, -10, 3, -1}, {0, 2, -6, 15, 124, -9, 3, -1},
+ {0, 2, -5, 13, 125, -8, 2, -1}, {0, 1, -4, 11, 125, -7, 2, 0},
+ {0, 1, -3, 8, 126, -6, 2, 0}, {0, 1, -3, 6, 127, -4, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0}, {0, 0, -1, 2, 128, -1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 8) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalPadding);
+ int subpixel_x = initial_subpixel_x;
+ // The code below calculates up to 7 extra upscaled
+ // pixels, which will over-read up to 7 downscaled pixels at the end of each
+ // row. kSuperResHorizontalPadding accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 8) {
+ const __m128i s =
+ LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_madd_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+ a[0] = _mm_hadd_epi32(a[0], a[1]);
+ a[1] = _mm_hadd_epi32(a[2], a[3]);
+ a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+ a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(
+ _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+ StoreAligned16(dst_ptr, clipped_16);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+ static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+ dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+ static_cast<void>(SuperRes_SSE4_1);
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
index aef5147..07a7ef4 100644
--- a/src/dsp/x86/super_res_sse4.h
+++ b/src/dsp/x86/super_res_sse4.h
@@ -30,9 +30,21 @@ void SuperResInit_SSE4_1();
} // namespace libgav1
#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
#ifndef LIBGAV1_Dsp8bpp_SuperRes
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h
index 208b301..9726495 100644
--- a/src/dsp/x86/transpose_sse4.h
+++ b/src/dsp/x86/transpose_sse4.h
@@ -30,9 +30,9 @@ LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
__m128i* const out) {
// Unpack 16 bit elements. Goes from:
// in[0]: 00 01 10 11 20 21 30 31
- // in[0]: 40 41 50 51 60 61 70 71
- // in[0]: 80 81 90 91 a0 a1 b0 b1
- // in[0]: c0 c1 d0 d1 e0 e1 f0 f1
+ // in[1]: 40 41 50 51 60 61 70 71
+ // in[2]: 80 81 90 91 a0 a1 b0 b1
+ // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
// to:
// a0: 00 40 01 41 10 50 11 51
// a1: 20 60 21 61 30 70 31 71
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 43279ab..9ddfeac 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -513,7 +513,7 @@ void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index dfd5662..08a1739 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -36,47 +36,65 @@ namespace {
constexpr int kRoundingBits8bpp = 4;
-template <bool mask_is_inverse>
-inline void WeightMask8_SSE4(const int16_t* prediction_0,
- const int16_t* prediction_1, uint8_t* mask) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- const __m128i difference = RightShiftWithRounding_U16(
- _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
- const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_SSE4(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i difference_0 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp);
+ const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i difference_1 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp);
+ const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4);
+
const __m128i difference_offset = _mm_set1_epi8(38);
const __m128i adjusted_difference =
- _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1),
difference_offset);
const __m128i mask_ceiling = _mm_set1_epi8(64);
const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
if (mask_is_inverse) {
const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
- StoreLo8(mask, inverted_mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
} else {
- StoreLo8(mask, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
}
}
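
Per mask byte, the computation above reduces to the following scalar form: the absolute prediction difference is rounded down by kRoundingBits8bpp, divided by 16, offset by 38 and capped at 64 (illustrative function name):

#include <algorithm>
#include <cstdint>
#include <cstdlib>

uint8_t WeightMaskPixelSketch(int16_t pred_0, int16_t pred_1,
                              bool mask_is_inverse) {
  constexpr int kRoundingBits = 4;  // kRoundingBits8bpp
  const int difference =
      (std::abs(pred_0 - pred_1) + (1 << (kRoundingBits - 1))) >> kRoundingBits;
  const int weight = std::min(64, 38 + (difference >> 4));
  return static_cast<uint8_t>(mask_is_inverse ? 64 - weight : weight);
}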
-#define WEIGHT8_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
-#define WEIGHT8_AND_STRIDE \
- WEIGHT8_WITHOUT_STRIDE; \
- pred_0 += 8; \
- pred_1 += 8; \
- mask += mask_stride
+#define WEIGHT8_PAIR_AND_STRIDE \
+ WEIGHT8_PAIR_WITHOUT_STRIDE; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
template <bool mask_is_inverse>
void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y = 0;
- do {
- WEIGHT8_AND_STRIDE;
- } while (++y < 7);
- WEIGHT8_WITHOUT_STRIDE;
+
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
template <bool mask_is_inverse>
@@ -84,13 +102,13 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 3;
do {
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- } while (++y3 < 5);
- WEIGHT8_WITHOUT_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
template <bool mask_is_inverse>
@@ -98,21 +116,17 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 5;
do {
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- } while (++y5 < 6);
- WEIGHT8_AND_STRIDE;
- WEIGHT8_WITHOUT_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
-#define WEIGHT16_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT16_AND_STRIDE \
WEIGHT16_WITHOUT_STRIDE; \
@@ -125,10 +139,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y = 0;
+ int y = 7;
do {
WEIGHT16_AND_STRIDE;
- } while (++y < 7);
+ } while (--y != 0);
WEIGHT16_WITHOUT_STRIDE;
}
@@ -137,12 +151,12 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 5;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y3 < 5);
+ } while (--y3 != 0);
WEIGHT16_WITHOUT_STRIDE;
}
@@ -151,14 +165,14 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 6;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y5 < 6);
+ } while (--y5 != 0);
WEIGHT16_AND_STRIDE;
WEIGHT16_WITHOUT_STRIDE;
}
@@ -168,20 +182,19 @@ void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 21;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y3 < 21);
+ } while (--y3 != 0);
WEIGHT16_WITHOUT_STRIDE;
}
-#define WEIGHT32_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE \
WEIGHT32_WITHOUT_STRIDE; \
@@ -209,12 +222,12 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 5;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y3 < 5);
+ } while (--y3 != 0);
WEIGHT32_WITHOUT_STRIDE;
}
@@ -223,14 +236,14 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 6;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y5 < 6);
+ } while (--y5 != 0);
WEIGHT32_AND_STRIDE;
WEIGHT32_WITHOUT_STRIDE;
}
@@ -240,24 +253,23 @@ void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 21;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y3 < 21);
+ } while (--y3 != 0);
WEIGHT32_WITHOUT_STRIDE;
}
-#define WEIGHT64_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE \
WEIGHT64_WITHOUT_STRIDE; \
@@ -447,12 +459,491 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kRoundingBits10bpp = 6;
+constexpr int kScaledDiffShift = 4;
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const __m128i diff_offset = _mm_set1_epi8(38);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i zero = _mm_setzero_si128();
+
+ // Range of prediction: [3988, 61532].
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00);
+ const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10);
+ const __m128i diff_lo_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero);
+ const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero);
+ const __m128i diff_hi_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp);
+
+ const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0);
+ const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01);
+ const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11);
+ const __m128i diff_lo_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero);
+ const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero);
+ const __m128i diff_hi_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp);
+
+ const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1);
+ const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift);
+
+ const __m128i adjusted_diff = _mm_adds_epu8(
+ _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset);
+ const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling);
+
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
+ } else {
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
+ }
+}
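The 10bpp kernel above widens to 32 bits before subtracting because, with predictions in the noted [3988, 61532] range, the absolute difference can overflow int16_t. With the same includes as the 8bpp sketch earlier, a scalar equivalent (illustration only) using the kRoundingBits10bpp and kScaledDiffShift constants introduced above is:

inline uint8_t WeightMaskPixel10bpp(uint16_t pred_0, uint16_t pred_1,
                                    bool mask_is_inverse) {
  // Widen to int so the difference of two 16-bit predictions cannot overflow.
  const int difference =
      std::abs(static_cast<int>(pred_0) - static_cast<int>(pred_1));
  const int rounded =
      (difference + (1 << (kRoundingBits10bpp - 1))) >> kRoundingBits10bpp;
  const int mask_value = std::min(64, (rounded >> kScaledDiffShift) + 38);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
}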
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
+
+template <bool mask_is_inverse>
+void WeightMask8x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 3;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 5;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT16_AND_STRIDE_10BPP \
+ WEIGHT16_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = 7;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE_10BPP \
+ WEIGHT32_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE_10BPP \
+ WEIGHT64_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_10bpp_SSE4<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_10bpp_SSE4<1>
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_TARGETING_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
index 07636b7..e5d9d70 100644
--- a/src/dsp/x86/weight_mask_sse4.h
+++ b/src/dsp/x86/weight_mask_sse4.h
@@ -99,6 +99,73 @@ void WeightMaskInit_SSE4_1();
#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_
diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h
index ab22a4d..7ee487f 100644
--- a/src/gav1/decoder_settings.h
+++ b/src/gav1/decoder_settings.h
@@ -62,7 +62,8 @@ typedef struct Libgav1DecoderSettings {
Libgav1GetFrameBufferCallback get_frame_buffer;
// Release frame buffer callback.
Libgav1ReleaseFrameBufferCallback release_frame_buffer;
- // Release input frame buffer callback.
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
Libgav1ReleaseInputBufferCallback release_input_buffer;
// Passed as the private_data argument to the callbacks.
void* callback_private_data;
@@ -117,7 +118,8 @@ struct DecoderSettings {
GetFrameBufferCallback get_frame_buffer = nullptr;
// Release frame buffer callback.
ReleaseFrameBufferCallback release_frame_buffer = nullptr;
- // Release input frame buffer callback.
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
ReleaseInputBufferCallback release_input_buffer = nullptr;
// Passed as the private_data argument to the callbacks.
void* callback_private_data = nullptr;
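Given the stronger wording, a minimal usage sketch may help (field names follow the C++ settings struct above; the callback body, and the assumption that the callback takes the two void* arguments declared in this header, are illustrative only):

libgav1::DecoderSettings settings;
settings.frame_parallel = true;
// With frame parallelism the decoder holds on to input buffers across calls,
// so the application must supply a way to get them back.
settings.release_input_buffer = [](void* callback_private_data,
                                   void* buffer_private_data) {
  // Return the buffer identified by |buffer_private_data| to the
  // application's own pool (application-specific, omitted here).
  static_cast<void>(callback_private_data);
  static_cast<void>(buffer_private_data);
};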
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
index ad7498c..116a514 100644
--- a/src/gav1/symbol_visibility.h
+++ b/src/gav1/symbol_visibility.h
@@ -58,6 +58,11 @@
//
// Much of the above information and more can be found at
// https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the
+// compiler command line to override the definition of LIBGAV1_PUBLIC in this
+// header. This can be used to create a libgav1 static library that will not
+// export any symbols when it is linked into a shared library.
#if !defined(LIBGAV1_PUBLIC)
#if defined(_WIN32)
@@ -76,7 +81,7 @@
#else
#define LIBGAV1_PUBLIC
#endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
-#else
+#else // !defined(_WIN32)
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
#else
diff --git a/src/gav1/version.h b/src/gav1/version.h
index 78a573e..c018928 100644
--- a/src/gav1/version.h
+++ b/src/gav1/version.h
@@ -24,7 +24,7 @@
#define LIBGAV1_MAJOR_VERSION 0
#define LIBGAV1_MINOR_VERSION 16
-#define LIBGAV1_PATCH_VERSION 1
+#define LIBGAV1_PATCH_VERSION 3
#define LIBGAV1_VERSION \
((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
index bbf00ed..69480d7 100644
--- a/src/obu_parser.cc
+++ b/src/obu_parser.cc
@@ -479,9 +479,13 @@ bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
return false;
}
+ sequence_header_changed_ = true;
decoder_state_.ClearReferenceFrames();
}
sequence_header_ = sequence_header;
+ if (!has_sequence_header_) {
+ sequence_header_changed_ = true;
+ }
has_sequence_header_ = true;
// Section 6.4.1: It is a requirement of bitstream conformance that if
// OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
@@ -509,12 +513,12 @@ void ObuParser::MarkInvalidReferenceFrames() {
if (lower_bound_is_smaller) {
if (reference_frame_id > decoder_state_.current_frame_id ||
reference_frame_id < lower_bound) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
} else {
if (reference_frame_id > decoder_state_.current_frame_id &&
reference_frame_id < lower_bound) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
}
}
@@ -621,7 +625,7 @@ bool ObuParser::ParseReferenceOrderHint() {
frame_header_.reference_order_hint[i] = scratch;
if (frame_header_.reference_order_hint[i] !=
decoder_state_.reference_order_hint[i]) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
}
return true;
@@ -1787,10 +1791,11 @@ bool ObuParser::ParseFrameParameters() {
// whenever display_frame_id is read, the value matches
// RefFrameId[ frame_to_show_map_idx ] ..., and that
// RefValid[ frame_to_show_map_idx ] is equal to 1.
+ //
+ // The current_frame_ == nullptr check below is equivalent to checking
+ // if RefValid[ frame_to_show_map_idx ] is equal to 1.
if (frame_header_.display_frame_id !=
- decoder_state_
- .reference_frame_id[frame_header_.frame_to_show] ||
- !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show]) {
LIBGAV1_DLOG(ERROR,
"Reference buffer %d has a frame id number mismatch.",
frame_header_.frame_to_show);
@@ -1868,7 +1873,6 @@ bool ObuParser::ParseFrameParameters() {
}
}
if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
- decoder_state_.reference_valid.fill(false);
decoder_state_.reference_order_hint.fill(0);
decoder_state_.reference_frame.fill(nullptr);
}
@@ -2019,15 +2023,8 @@ bool ObuParser::ParseFrameParameters() {
// Note if support for Annex C: Error resilience behavior is added this
// check should be omitted per C.5 Decoder consequences of processable
// frames.
- if (!decoder_state_.reference_valid[reference_frame_index]) {
- LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
- reference_frame_index);
- return false;
- }
- // Check if the inter frame requests a nonexistent reference, whether or
- // not frame_refs_short_signaling is used.
if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
- LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i,
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
reference_frame_index);
return false;
}
@@ -2043,12 +2040,8 @@ bool ObuParser::ParseFrameParameters() {
// Section 6.8.2: It is a requirement of bitstream conformance that
// whenever expectedFrameId[ i ] is calculated, the value matches
// RefFrameId[ ref_frame_idx[ i ] ] ...
- //
- // Section 6.8.2: It is a requirement of bitstream conformance that
- // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
if (frame_header_.expected_frame_id[i] !=
- decoder_state_.reference_frame_id[reference_frame_index] ||
- !decoder_state_.reference_valid[reference_frame_index]) {
+ decoder_state_.reference_frame_id[reference_frame_index]) {
LIBGAV1_DLOG(ERROR,
"Reference buffer %d has a frame id number mismatch.",
reference_frame_index);
@@ -2665,6 +2658,7 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
metadata_ = {};
tile_buffers_.clear();
next_tile_group_start_ = 0;
+ sequence_header_changed_ = false;
bool parsed_one_full_frame = false;
bool seen_frame_header = false;
diff --git a/src/obu_parser.h b/src/obu_parser.h
index 86d165f..c4619ed 100644
--- a/src/obu_parser.h
+++ b/src/obu_parser.h
@@ -276,6 +276,9 @@ class ObuParser : public Allocable {
const ObuFrameHeader& frame_header() const { return frame_header_; }
const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
const ObuMetadata& metadata() const { return metadata_; }
+ // Returns true if the last call to ParseOneFrame() encountered a sequence
+ // header change.
+ bool sequence_header_changed() const { return sequence_header_changed_; }
// Setters.
void set_sequence_header(const ObuSequenceHeader& sequence_header) {
@@ -384,6 +387,9 @@ class ObuParser : public Allocable {
int next_tile_group_start_ = 0;
// If true, the sequence_header_ field is valid.
bool has_sequence_header_ = false;
+ // If true, it means that the last call to ParseOneFrame() encountered a
+ // sequence header change.
+ bool sequence_header_changed_ = false;
// If true, the obu_extension_flag syntax element in the OBU header must be
// 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
bool extension_disallowed_ = false;
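A hedged sketch of how a caller inside the decoder might consume the new getter (the |obu_parser| pointer and the surrounding decoder plumbing are assumed, not shown in this patch):

RefCountedBufferPtr current_frame;
if (obu_parser->ParseOneFrame(&current_frame) == kStatusOk &&
    obu_parser->sequence_header_changed()) {
  // A sequence header change invalidates state derived from the previous
  // sequence header, so re-derive any such cached state here.
}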
diff --git a/src/post_filter.h b/src/post_filter.h
index 800d51d..dfcd08e 100644
--- a/src/post_filter.h
+++ b/src/post_filter.h
@@ -272,8 +272,6 @@ class PostFilter {
void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
bool for_loop_restoration);
// Sets up the |loop_restoration_border_| for loop restoration.
- // TODO(linfengz): Unify duplicates in the following two functions if
- // possible.
// This is called when there is no CDEF filter. We copy rows from
// |superres_buffer_| and do the line extension.
void SetupLoopRestorationBorder(int row4x4_start);
@@ -401,11 +399,14 @@ class PostFilter {
// Applies super resolution for the |src| for |rows[plane]| rows of each
// plane. If |line_buffer_row| is larger than or equal to 0, one more row will
// be processed, the line buffer indicated by |line_buffer_row| will be used
- // as the source.
+ // as the source. If |dst_is_loop_restoration_border| is true, then it means
+ // that the |dst| pointers come from |loop_restoration_border_| and the
+ // strides will be populated from that buffer.
void ApplySuperRes(
const std::array<uint8_t*, kMaxPlanes>& src,
const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
- const std::array<uint8_t*, kMaxPlanes>& dst); // Section 7.16.
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border = false); // Section 7.16.
// Applies SuperRes for the superblock row starting at |row4x4| with a height
// of 4*|sb4x4|.
void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
index 994f448..f32b0a0 100644
--- a/src/post_filter/cdef.cc
+++ b/src/post_filter/cdef.cc
@@ -272,7 +272,7 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
const uint16_t* cdef_src_row_base[kMaxPlanes];
int cdef_src_row_base_stride[kMaxPlanes];
int column_step[kMaxPlanes];
- assert(planes_ >= 1);
+ assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
int plane = kPlaneY;
do {
cdef_buffer_row_base[plane] =
diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc
index 3d5da90..826ef48 100644
--- a/src/post_filter/loop_restoration.cc
+++ b/src/post_filter/loop_restoration.cc
@@ -29,15 +29,15 @@ void PostFilter::ApplyLoopRestorationForOneRow(
unit_row * num_horizontal_units);
const bool in_place = DoCdef() || thread_pool_ != nullptr;
const Pixel* border = nullptr;
+ ptrdiff_t border_stride = 0;
src_buffer += unit_y * stride;
if (in_place) {
- assert(loop_restoration_border_.stride(plane) ==
- static_cast<int>(sizeof(Pixel) * stride));
const int border_unit_y = std::max(
RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+ border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel);
border =
reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
- border_unit_y * stride;
+ border_unit_y * border_stride;
}
int unit_column = 0;
int column = 0;
@@ -61,18 +61,22 @@ void PostFilter::ApplyLoopRestorationForOneRow(
}
} else {
const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+ ptrdiff_t top_border_stride = stride;
const Pixel* bottom_border = src + current_process_unit_height * stride;
+ ptrdiff_t bottom_border_stride = stride;
const bool frame_bottom_border =
(unit_y + current_process_unit_height >= plane_height);
if (in_place && (unit_y != 0 || !frame_bottom_border)) {
const Pixel* loop_restoration_border = border + column;
if (unit_y != 0) {
top_border = loop_restoration_border;
- loop_restoration_border += 4 * stride;
+ top_border_stride = border_stride;
+ loop_restoration_border += 4 * border_stride;
}
if (!frame_bottom_border) {
- bottom_border =
- loop_restoration_border + kRestorationVerticalBorder * stride;
+ bottom_border = loop_restoration_border +
+ kRestorationVerticalBorder * border_stride;
+ bottom_border_stride = border_stride;
}
}
RestorationBuffer restoration_buffer;
@@ -81,10 +85,10 @@ void PostFilter::ApplyLoopRestorationForOneRow(
type == kLoopRestorationTypeWiener);
const dsp::LoopRestorationFunc restoration_func =
dsp_.loop_restorations[type - 2];
- restoration_func(restoration_info[unit_column], src, top_border,
- bottom_border, stride, current_process_unit_width,
- current_process_unit_height, &restoration_buffer,
- dst_buffer + column);
+ restoration_func(restoration_info[unit_column], src, stride, top_border,
+ top_border_stride, bottom_border, bottom_border_stride,
+ current_process_unit_width, current_process_unit_height,
+ &restoration_buffer, dst_buffer + column);
}
++unit_column;
column += plane_unit_size;
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
index 0eacf34..7671f01 100644
--- a/src/post_filter/post_filter.cc
+++ b/src/post_filter/post_filter.cc
@@ -306,11 +306,11 @@ void PostFilter::ExtendBordersForReferenceFrame() {
}
void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
- assert(frame_buffer_.stride(plane) == loop_restoration_border_.stride(plane));
- const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
const int row_offset = DivideBy4(row4x4);
- uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride;
const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
subsampling_x_[plane]);
const int row_width = num_pixels << pixel_size_log2_;
@@ -326,9 +326,9 @@ void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
// border extension).
row = last_valid_row;
}
- memcpy(dst, src + row * stride, row_width);
+ memcpy(dst, src + row * src_stride, row_width);
last_valid_row = row;
- dst += stride;
+ dst += dst_stride;
}
}
@@ -395,9 +395,6 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
continue;
}
- assert(frame_buffer_.stride(plane) ==
- loop_restoration_border_.stride(plane));
- const ptrdiff_t stride = frame_buffer_.stride(plane);
const int row_offset = DivideBy4(row4x4);
const int num_pixels =
SubsampledValue(upscaled_width_, subsampling_x_[plane]);
@@ -406,9 +403,13 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
const int absolute_row =
(MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
const uint8_t* src =
- GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride;
- uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+ GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst =
+ loop_restoration_border_.data(plane) + row_offset * dst_stride;
for (int i = 0; i < 4; ++i) {
memcpy(dst, src, row_width);
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -421,8 +422,8 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
kRestorationHorizontalBorder);
// If we run out of rows, copy the last valid row (mimics the bottom
// border extension).
- if (absolute_row + i < plane_height - 1) src += stride;
- dst += stride;
+ if (absolute_row + i < plane_height - 1) src += src_stride;
+ dst += dst_stride;
}
} while (++plane < planes_);
}
@@ -434,7 +435,7 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
const int row4x4 = row4x4_start + sb_y;
const int row_offset_start = DivideBy4(row4x4);
- std::array<uint8_t*, kMaxPlanes> dst = {
+ const std::array<uint8_t*, kMaxPlanes> dst = {
loop_restoration_border_.data(kPlaneY) +
row_offset_start * loop_restoration_border_.stride(kPlaneY),
loop_restoration_border_.data(kPlaneU) +
@@ -462,13 +463,14 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
row * frame_buffer_.stride(plane);
rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
} while (++plane < planes_);
- ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst,
+ /*dst_is_loop_restoration_border=*/true);
// If we run out of rows, copy the last valid row (mimics the bottom
// border extension).
plane = kPlaneY;
do {
if (rows[plane] == 0 || rows[plane] >= 4) continue;
- const ptrdiff_t stride = frame_buffer_.stride(plane);
+ const ptrdiff_t stride = loop_restoration_border_.stride(plane);
uint8_t* dst_line = dst[plane] + rows[plane] * stride;
const uint8_t* const src_line = dst_line - stride;
const int upscaled_width = super_res_info_[plane].upscaled_width
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
index a70e4ed..554e537 100644
--- a/src/post_filter/super_res.cc
+++ b/src/post_filter/super_res.cc
@@ -19,7 +19,8 @@ namespace libgav1 {
void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
const std::array<int, kMaxPlanes>& rows,
const int line_buffer_row,
- const std::array<uint8_t*, kMaxPlanes>& dst) {
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border /*=false*/) {
int plane = kPlaneY;
do {
const int plane_width =
@@ -28,13 +29,19 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
if (bitdepth_ >= 10) {
auto* input = reinterpret_cast<uint16_t*>(src[plane]);
auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
- const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t);
+ const ptrdiff_t input_stride =
+ frame_buffer_.stride(plane) / sizeof(uint16_t);
+ const ptrdiff_t output_stride =
+ (dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane)) /
+ sizeof(uint16_t);
if (rows[plane] > 0) {
dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
- input, stride, rows[plane], plane_width,
+ input, input_stride, rows[plane], plane_width,
super_res_info_[plane].upscaled_width,
super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step, output);
+ super_res_info_[plane].step, output, output_stride);
}
// In the multi-threaded case, the |superres_line_buffer_| holds the last
// input row. Apply SuperRes for that row.
@@ -44,24 +51,29 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
line_buffer_row * superres_line_buffer_.stride(plane) /
sizeof(uint16_t) +
kSuperResHorizontalBorder;
- dsp_.super_res(
- superres_coefficients_[static_cast<int>(plane != 0)],
- line_buffer_start, /*stride=*/0,
- /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
- super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step, output + rows[plane] * stride);
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ output + rows[plane] * output_stride, /*dest_stride=*/0);
}
continue;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
uint8_t* input = src[plane];
uint8_t* output = dst[plane];
+ const ptrdiff_t input_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t output_stride = dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane);
if (rows[plane] > 0) {
dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
- input, frame_buffer_.stride(plane), rows[plane],
- plane_width, super_res_info_[plane].upscaled_width,
+ input, input_stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step, output);
+ super_res_info_[plane].step, output, output_stride);
}
// In the multi-threaded case, the |superres_line_buffer_| holds the last
// input row. Apply SuperRes for that row.
@@ -70,13 +82,13 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
superres_line_buffer_.data(plane) +
line_buffer_row * superres_line_buffer_.stride(plane) +
kSuperResHorizontalBorder;
- dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
- line_buffer_start, /*stride=*/0,
- /*height=*/1, plane_width,
- super_res_info_[plane].upscaled_width,
- super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step,
- output + rows[plane] * frame_buffer_.stride(plane));
+ dsp_.super_res(
+ superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output + rows[plane] * output_stride,
+ /*dest_stride=*/0);
}
} while (++plane < planes_);
}
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
index e166392..44a842c 100644
--- a/src/residual_buffer_pool.cc
+++ b/src/residual_buffer_pool.cc
@@ -129,7 +129,8 @@ std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
}
void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
- buffer->transform_parameters()->Reset();
+ buffer->transform_parameters()->Clear();
+ buffer->partition_tree_order()->Clear();
std::lock_guard<std::mutex> lock(mutex_);
buffers_.Push(std::move(buffer));
}
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
index f7bc75d..75924db 100644
--- a/src/residual_buffer_pool.h
+++ b/src/residual_buffer_pool.h
@@ -27,73 +27,11 @@
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
+#include "src/utils/queue.h"
#include "src/utils/types.h"
namespace libgav1 {
-// A simple fixed size queue implementation to hold the transform parameters
-// when |Tile::split_parse_and_decode_| is true. We don't have to do any
-// boundary checks since we always push data into the queue before accessing it.
-class TransformParameterQueue {
- public:
- TransformParameterQueue() = default;
-
- // Move only.
- TransformParameterQueue(TransformParameterQueue&& other) = default;
- TransformParameterQueue& operator=(TransformParameterQueue&& other) = default;
-
- LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
- max_size_ = max_size;
- // No initialization is necessary since the data will be always written to
- // before being read.
- non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
- tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
- return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
- }
-
- // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue.
- void Push(int non_zero_coeff_count, TransformType tx_type) {
- assert(back_ < max_size_);
- non_zero_coeff_count_[back_] = non_zero_coeff_count;
- tx_type_[back_++] = tx_type;
- }
-
- // Returns the non_zero_coeff_count at the front of the queue.
- int16_t NonZeroCoeffCount() const {
- assert(front_ != back_);
- return non_zero_coeff_count_[front_];
- }
-
- // Returns the tx_type at the front of the queue.
- TransformType Type() const {
- assert(front_ != back_);
- return tx_type_[front_];
- }
-
- // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the
- // queue.
- void Pop() {
- assert(front_ != back_);
- ++front_;
- }
-
- // Clears the queue.
- void Reset() {
- front_ = 0;
- back_ = 0;
- }
-
- // Used only in the tests. Returns the number of elements in the queue.
- int Size() const { return back_ - front_; }
-
- private:
- int max_size_ = 0;
- std::unique_ptr<int16_t[]> non_zero_coeff_count_;
- std::unique_ptr<TransformType[]> tx_type_;
- int front_ = 0;
- int back_ = 0;
-};
-
// This class is used for parsing and decoding a superblock. Members of this
// class are populated in the "parse" step and consumed in the "decode" step.
class ResidualBuffer : public Allocable {
@@ -104,7 +42,8 @@ class ResidualBuffer : public Allocable {
if (buffer != nullptr) {
buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
if (buffer->buffer_ == nullptr ||
- !buffer->transform_parameters_.Init(queue_size)) {
+ !buffer->transform_parameters_.Init(queue_size) ||
+ !buffer->partition_tree_order_.Init(queue_size)) {
buffer = nullptr;
}
}
@@ -118,9 +57,14 @@ class ResidualBuffer : public Allocable {
// Buffer used to store the residual values.
uint8_t* buffer() { return buffer_.get(); }
// Queue used to store the transform parameters.
- TransformParameterQueue* transform_parameters() {
+ Queue<TransformParameters>* transform_parameters() {
return &transform_parameters_;
}
+ // Queue used to store the block ordering in the partition tree of the
+ // superblocks.
+ Queue<PartitionTreeNode>* partition_tree_order() {
+ return &partition_tree_order_;
+ }
private:
friend class ResidualBufferStack;
@@ -128,7 +72,8 @@ class ResidualBuffer : public Allocable {
ResidualBuffer() = default;
AlignedUniquePtr<uint8_t> buffer_;
- TransformParameterQueue transform_parameters_;
+ Queue<TransformParameters> transform_parameters_;
+ Queue<PartitionTreeNode> partition_tree_order_;
// Used by ResidualBufferStack to form a chain of ResidualBuffers.
ResidualBuffer* next_ = nullptr;
};
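Taken together with the tile.cc changes further down, the parse/decode handshake these two queues implement can be sketched as follows (an illustrative pseudo-flow under the assumption that Queue exposes Push/Front/Pop as used below, not the actual Tile methods):

// Parse step (kProcessingModeParseOnly): record what the decode step needs.
residual_buffer->partition_tree_order()->Push(
    PartitionTreeNode(row4x4, column4x4, block_size));
residual_buffer->transform_parameters()->Push(
    TransformParameters(tx_type, non_zero_coeff_count));

// Decode step (kProcessingModeDecodeOnly): replay the blocks in parse order.
const PartitionTreeNode node = residual_buffer->partition_tree_order()->Front();
residual_buffer->partition_tree_order()->Pop();
const TransformParameters& tx_params =
    residual_buffer->transform_parameters()->Front();
// ... reconstruct using tx_params.type and tx_params.non_zero_coeff_count ...
residual_buffer->transform_parameters()->Pop();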
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc
index cd4d576..17ce18f 100644
--- a/src/threading_strategy.cc
+++ b/src/threading_strategy.cc
@@ -36,24 +36,25 @@ constexpr int kFrameParallelThresholdMultiplier =
// Computes the number of frame threads to be used based on the following
// heuristic:
// * If |thread_count| == 1, return 0.
-// * If |thread_count| <= |tile_count| * 4, return 0.
+// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+// return 0.
// * Otherwise, return the largest value of i which satisfies the following
// condition: i + i * tile_columns <= thread_count. This ensures that there
// are at least |tile_columns| worker threads for each frame thread.
// * This function will never return 1 or a value > |thread_count|.
//
-// This heuristic is based empirical performance data. The in-frame threading
-// model (combination of tile multithreading, superblock row multithreading and
-// post filter multithreading) performs better than the frame parallel model
-// until we reach the threshold of |thread_count| > |tile_count| *
-// kFrameParallelThresholdMultiplier.
+// This heuristic is based on empirical performance data. The in-frame
+// threading model (combination of tile multithreading, superblock row
+// multithreading and post filter multithreading) performs better than the
+// frame parallel model until we reach the threshold of |thread_count| >
+// |tile_count| * kFrameParallelThresholdMultiplier.
//
// It is a function of |tile_count| since tile threading and superblock row
-// multithreading will scale only as a factor of |tile_count|. The threshold 4
-// is arrived at based on empirical data. The general idea is that superblock
-// row multithreading plateaus at 4 * |tile_count| because in most practical
-// cases there aren't more than that many superblock rows and columns available
-// to work on in parallel.
+// multithreading will scale only as a factor of |tile_count|. The threshold
+// kFrameParallelThresholdMultiplier is arrived at based on empirical data.
+// The general idea is that superblock row multithreading plateaus at 4 *
+// |tile_count| because in most practical cases there aren't more than that
+// many superblock rows and columns available to work on in parallel.
int ComputeFrameThreadCount(int thread_count, int tile_count,
int tile_columns) {
assert(thread_count > 0);
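A short sketch of the heuristic the rewritten comment describes; it mirrors the stated rules rather than the exact library implementation:

int FrameThreadCountSketch(int thread_count, int tile_count, int tile_columns) {
  // Rules 1-2: small thread counts stay with in-frame threading.
  if (thread_count <= tile_count * kFrameParallelThresholdMultiplier) return 0;
  // Rule 3: the largest i with i + i * tile_columns <= thread_count, so each
  // frame thread keeps |tile_columns| workers for itself.
  const int i = thread_count / (1 + tile_columns);
  // Rule 4: never return 1 (and i can never exceed |thread_count|).
  return (i > 1) ? i : 0;
}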
@@ -132,7 +133,7 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
thread_count -= 2;
if (thread_count <= 0) break;
}
-#else // !defined(__ANDROID__)
+#else // !defined(__ANDROID__)
// Assign the remaining threads to each Tile.
for (int i = 0; i < tile_count; ++i) {
const int count = thread_count / tile_count +
diff --git a/src/tile.h b/src/tile.h
index 73bb5fd..6bae2a0 100644
--- a/src/tile.h
+++ b/src/tile.h
@@ -48,7 +48,6 @@
#include "src/utils/constants.h"
#include "src/utils/entropy_decoder.h"
#include "src/utils/memory.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/segmentation_map.h"
#include "src/utils/threadpool.h"
#include "src/utils/types.h"
@@ -292,26 +291,25 @@ class Tile : public Allocable {
// iteratively. It performs a DFS traversal over the partition tree to process
// the blocks in the right order.
bool ProcessPartition(
- int row4x4_start, int column4x4_start, ParameterTree* root,
- TileScratchBuffer* scratch_buffer,
+ int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
ResidualPtr* residual); // Iterative implementation of 5.11.4.
bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
- ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ TileScratchBuffer* scratch_buffer,
ResidualPtr* residual); // 5.11.5.
void ResetCdef(int row4x4, int column4x4); // 5.11.55.
// This function is used to decode a superblock when the parsing has already
// been done for that superblock.
- bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
- ResidualPtr* residual);
+ bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* scratch_buffer);
// Helper function used by DecodeSuperBlock(). Note that the decode_block()
// function in the spec is equivalent to ProcessBlock() in the code.
- bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
- ResidualPtr* residual);
+ bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
int column4x4); // 5.11.3.
- bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ bool ProcessSuperBlock(int row4x4, int column4x4,
TileScratchBuffer* scratch_buffer,
ProcessingMode mode);
void ResetLoopRestorationParams();
diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc
index 674d210..41b42d6 100644
--- a/src/tile/bitstream/palette.cc
+++ b/src/tile/bitstream/palette.cc
@@ -130,10 +130,10 @@ void Tile::ReadPaletteColors(const Block& block, Plane plane) {
void Tile::ReadPaletteModeInfo(const Block& block) {
BlockParameters& bp = *block.bp;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
!frame_header_.allow_screen_content_tools) {
- bp.palette_mode_info.size[kPlaneTypeY] = 0;
- bp.palette_mode_info.size[kPlaneTypeUV] = 0;
return;
}
const int block_size_context =
@@ -156,7 +156,7 @@ void Tile::ReadPaletteModeInfo(const Block& block) {
ReadPaletteColors(block, kPlaneY);
}
}
- if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+ if (block.HasChroma() && bp.uv_mode == kPredictionModeDc) {
const int context =
static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
const bool has_palette_uv =
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
index ee48f17..9699517 100644
--- a/src/tile/tile.cc
+++ b/src/tile/tile.cc
@@ -609,7 +609,7 @@ bool Tile::ProcessSuperBlockRow(int row4x4,
const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
column4x4 += block_width4x4) {
- if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
processing_mode)) {
LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
row4x4, column4x4);
@@ -642,9 +642,6 @@ void Tile::SaveSymbolDecoderContext() {
}
bool Tile::ParseAndDecode() {
- // If this is the main thread, we build the loop filter bit masks when parsing
- // so that it happens in the current thread. This ensures that the main thread
- // does as much work as possible.
if (split_parse_and_decode_) {
if (!ThreadedParseAndDecode()) return false;
SaveSymbolDecoderContext();
@@ -776,8 +773,8 @@ bool Tile::ThreadedParseAndDecode() {
for (int column4x4 = column4x4_start_, column_index = 0;
column4x4 < column4x4_end_;
column4x4 += block_width4x4, ++column_index) {
- if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
- scratch_buffer.get(), kProcessingModeParseOnly)) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeParseOnly)) {
std::lock_guard<std::mutex> lock(threading_.mutex);
threading_.abort = true;
break;
@@ -862,8 +859,8 @@ void Tile::DecodeSuperBlock(int row_index, int column_index,
tile_scratch_buffer_pool_->Get();
bool ok = scratch_buffer != nullptr;
if (ok) {
- ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
- scratch_buffer.get(), kProcessingModeDecodeOnly);
+ ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeDecodeOnly);
tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
}
std::unique_lock<std::mutex> lock(threading_.mutex);
@@ -1629,11 +1626,12 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
const int sb_row_index = SuperBlockRowIndex(block.row4x4);
const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
if (mode == kProcessingModeDecodeOnly) {
- TransformParameterQueue& tx_params =
+ Queue<TransformParameters>& tx_params =
*residual_buffer_threaded_[sb_row_index][sb_column_index]
->transform_parameters();
ReconstructBlock(block, plane, start_x, start_y, tx_size,
- tx_params.Type(), tx_params.NonZeroCoeffCount());
+ tx_params.Front().type,
+ tx_params.Front().non_zero_coeff_count);
tx_params.Pop();
} else {
TransformType tx_type;
@@ -1656,7 +1654,7 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
assert(mode == kProcessingModeParseOnly);
residual_buffer_threaded_[sb_row_index][sb_column_index]
->transform_parameters()
- ->Push(non_zero_coeff_count, tx_type);
+ ->Push(TransformParameters(tx_type, non_zero_coeff_count));
}
}
}
@@ -1886,6 +1884,7 @@ bool Tile::AssignInterMv(const Block& block, bool is_compound) {
GetClampParameters(block, min, max);
BlockParameters& bp = *block.bp;
const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ bp.mv.mv64 = 0;
if (is_compound) {
for (int i = 0; i < 2; ++i) {
const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
@@ -1948,6 +1947,7 @@ bool Tile::AssignIntraMv(const Block& block) {
BlockParameters& bp = *block.bp;
const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+ bp.mv.mv64 = 0;
ReadMotionVector(block, 0);
if (ref_mv_0.mv32 == 0) {
const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
@@ -2122,7 +2122,6 @@ void Tile::PopulateDeblockFilterLevel(const Block& block) {
}
bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
- ParameterTree* const tree,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
// Do not process the block if the starting point is beyond the visible frame.
@@ -2133,8 +2132,24 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
column4x4 >= frame_header_.columns4x4) {
return true;
}
- BlockParameters& bp = *tree->parameters();
- block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+
+ if (split_parse_and_decode_) {
+ // Push block ordering info to the queue. DecodeBlock() will use this queue
+ // to decode the blocks in the correct order.
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order()
+ ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+ }
+
+ BlockParameters* bp_ptr =
+ block_parameters_holder_.Get(row4x4, column4x4, block_size);
+ if (bp_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+ return false;
+ }
+ BlockParameters& bp = *bp_ptr;
Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
bp.size = block_size;
bp.prediction_parameters =
@@ -2186,16 +2201,13 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
return true;
}
-bool Tile::DecodeBlock(ParameterTree* const tree,
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
- const int row4x4 = tree->row4x4();
- const int column4x4 = tree->column4x4();
if (row4x4 >= frame_header_.rows4x4 ||
column4x4 >= frame_header_.columns4x4) {
return true;
}
- const BlockSize block_size = tree->block_size();
Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
if (!ComputePrediction(block) ||
!Residual(block, kProcessingModeDecodeOnly)) {
@@ -2206,27 +2218,22 @@ bool Tile::DecodeBlock(ParameterTree* const tree,
}
bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
- ParameterTree* const root,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
- Stack<ParameterTree*, kDfsStackSize> stack;
+ Stack<PartitionTreeNode, kDfsStackSize> stack;
// Set up the first iteration.
- ParameterTree* node = root;
- int row4x4 = row4x4_start;
- int column4x4 = column4x4_start;
- BlockSize block_size = SuperBlockSize();
+ stack.Push(
+ PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
// DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
// Otherwise, the children are pushed into the stack for future processing.
do {
- if (!stack.Empty()) {
- // Set up subsequent iterations.
- node = stack.Pop();
- row4x4 = node->row4x4();
- column4x4 = node->column4x4();
- block_size = node->block_size();
- }
+ PartitionTreeNode node = stack.Pop();
+ int row4x4 = node.row4x4;
+ int column4x4 = node.column4x4;
+ BlockSize block_size = node.block_size;
+
if (row4x4 >= frame_header_.rows4x4 ||
column4x4 >= frame_header_.columns4x4) {
continue;
@@ -2262,13 +2269,13 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
sequence_header_.color_config.subsampling_y);
return false;
}
- if (!node->SetPartitionType(partition)) {
- LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
- return false;
- }
+
+ const int quarter_block4x4 = half_block4x4 >> 1;
+ const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
+ assert(partition == kPartitionNone || sub_size != kBlockInvalid);
switch (partition) {
case kPartitionNone:
- if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
residual)) {
return false;
}
@@ -2276,28 +2283,82 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
case kPartitionSplit:
// The children must be added in reverse order since a stack is being
// used.
- for (int i = 3; i >= 0; --i) {
- ParameterTree* const child = node->children(i);
- assert(child != nullptr);
- stack.Push(child);
- }
+ stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
+ column4x4 + half_block4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
+ stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
break;
case kPartitionHorizontal:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVertical:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontalWithTopSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontalWithBottomSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVerticalWithLeftSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVerticalWithRightSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontal4:
+ for (int i = 0; i < 4; ++i) {
+ if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ }
+ break;
case kPartitionVertical4:
for (int i = 0; i < 4; ++i) {
- ParameterTree* const child = node->children(i);
- // Once a null child is seen, all the subsequent children will also be
- // null.
- if (child == nullptr) break;
- if (!ProcessBlock(child->row4x4(), child->column4x4(),
- child->block_size(), child, scratch_buffer,
- residual)) {
+ if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
+ scratch_buffer, residual)) {
return false;
}
}
@@ -2370,7 +2431,7 @@ void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
}
}
-bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
TileScratchBuffer* const scratch_buffer,
ProcessingMode mode) {
const bool parsing =
@@ -2388,13 +2449,10 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
if (parsing) {
ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
}
- const int row = row4x4 / block_width4x4;
- const int column = column4x4 / block_width4x4;
if (parsing && decoding) {
uint8_t* residual_buffer = residual_buffer_.get();
- if (!ProcessPartition(row4x4, column4x4,
- block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
column4x4);
return false;
@@ -2412,18 +2470,14 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
}
uint8_t* residual_buffer =
residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
- if (!ProcessPartition(row4x4, column4x4,
- block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
column4x4);
return false;
}
} else {
- uint8_t* residual_buffer =
- residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
- if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
row4x4, column4x4);
return false;
@@ -2434,26 +2488,23 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
return true;
}
-bool Tile::DecodeSuperBlock(ParameterTree* const tree,
- TileScratchBuffer* const scratch_buffer,
- ResidualPtr* residual) {
- Stack<ParameterTree*, kDfsStackSize> stack;
- stack.Push(tree);
- do {
- ParameterTree* const node = stack.Pop();
- if (node->partition() != kPartitionNone) {
- for (int i = 3; i >= 0; --i) {
- if (node->children(i) == nullptr) continue;
- stack.Push(node->children(i));
- }
- continue;
- }
- if (!DecodeBlock(node, scratch_buffer, residual)) {
+bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* const scratch_buffer) {
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ Queue<PartitionTreeNode>& partition_tree_order =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order();
+ while (!partition_tree_order.Empty()) {
+ PartitionTreeNode block = partition_tree_order.Front();
+ if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+ scratch_buffer, &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
- node->row4x4(), node->column4x4());
+ block.row4x4, block.column4x4);
return false;
}
- } while (!stack.Empty());
+ partition_tree_order.Pop();
+ }
return true;
}
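
The rewritten Tile::ProcessPartition no longer walks a preallocated ParameterTree; it keeps an explicit stack of plain PartitionTreeNode values and pushes the children of a split in reverse order so the top-left child is processed first. Below is a simplified, stand-alone sketch of that traversal pattern, assuming square blocks and splitting every block down to a single 4x4 unit; Node is a reduced stand-in for PartitionTreeNode and the printf marks where ProcessBlock() would be invoked in the real code.

#include <cstdio>
#include <stack>

// Reduced stand-in for PartitionTreeNode (src/utils/types.h).
struct Node {
  int row4x4;
  int column4x4;
  int width4x4;  // Assumes square blocks for this sketch.
};

// Depth-first traversal of a superblock. Children are pushed in reverse so
// the top-left child pops first, mirroring Tile::ProcessPartition.
void Walk(int row4x4_start, int column4x4_start, int sb_width4x4) {
  std::stack<Node> stack;
  stack.push({row4x4_start, column4x4_start, sb_width4x4});
  while (!stack.empty()) {
    const Node node = stack.top();
    stack.pop();
    if (node.width4x4 == 1) {
      // Leaf: the real code calls ProcessBlock() here.
      std::printf("block at (%d, %d)\n", node.row4x4, node.column4x4);
      continue;
    }
    const int half = node.width4x4 >> 1;
    stack.push({node.row4x4 + half, node.column4x4 + half, half});
    stack.push({node.row4x4 + half, node.column4x4, half});
    stack.push({node.row4x4, node.column4x4 + half, half});
    stack.push({node.row4x4, node.column4x4, half});
  }
}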
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
index 2df6241..df2da9f 100644
--- a/src/utils/array_2d.h
+++ b/src/utils/array_2d.h
@@ -120,7 +120,7 @@ class Array2D {
const T* operator[](int row) const { return data_view_[row]; }
private:
- std::unique_ptr<T[]> data_ = nullptr;
+ std::unique_ptr<T[]> data_;
size_t allocated_size_ = 0;
size_t size_ = 0;
Array2DView<T> data_view_;
diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc
index 3ccdb9b..3bb9f1e 100644
--- a/src/utils/block_parameters_holder.cc
+++ b/src/utils/block_parameters_holder.cc
@@ -19,53 +19,29 @@
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/types.h"
namespace libgav1 {
-namespace {
-
-// Returns the number of super block rows/columns for |value4x4| where value4x4
-// is either rows4x4 or columns4x4.
-int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
- return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127)
- : DivideBy64(MultiplyBy4(value4x4) + 63);
-}
-
-} // namespace
-
-bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
- bool use_128x128_superblock) {
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) {
rows4x4_ = rows4x4;
columns4x4_ = columns4x4;
- use_128x128_superblock_ = use_128x128_superblock;
- if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
- LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
- return false;
- }
- const int rows =
- RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_);
- const int columns =
- RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_);
- const BlockSize sb_size =
- use_128x128_superblock_ ? kBlock128x128 : kBlock64x64;
- const int multiplier = kNum4x4BlocksWide[sb_size];
- if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) {
- LIBGAV1_DLOG(ERROR, "trees_.Reset() failed.");
- return false;
- }
- for (int i = 0; i < rows; ++i) {
- for (int j = 0; j < columns; ++j) {
- trees_[i][j] =
- ParameterTree::Create(i * multiplier, j * multiplier, sb_size);
- if (trees_[i][j] == nullptr) {
- LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j);
- return false;
- }
- }
+ index_ = 0;
+ return block_parameters_cache_.Reset(rows4x4_, columns4x4_) &&
+ block_parameters_.Resize(rows4x4_ * columns4x4_);
+}
+
+BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4,
+ BlockSize block_size) {
+ const size_t index = index_.fetch_add(1, std::memory_order_relaxed);
+ if (index >= block_parameters_.size()) return nullptr;
+ auto& bp = block_parameters_.get()[index];
+ if (bp == nullptr) {
+ bp.reset(new (std::nothrow) BlockParameters);
+ if (bp == nullptr) return nullptr;
}
- return true;
+ FillCache(row4x4, column4x4, block_size, bp.get());
+ return bp.get();
}
void BlockParametersHolder::FillCache(int row4x4, int column4x4,
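
BlockParametersHolder::Get() now hands out BlockParameters slots from a flat buffer using an atomic bump index, so concurrent callers can claim blocks without a mutex and the per-slot allocations are reused across frames. A minimal stand-alone sketch of that pattern, with Params and std::vector as simplified stand-ins for BlockParameters and DynamicBuffer (the cache-filling step is omitted):

#include <atomic>
#include <cstddef>
#include <memory>
#include <new>
#include <vector>

struct Params {};  // Stand-in for BlockParameters.

class ParamsPool {
 public:
  // Resets the bump index; previously allocated slots are kept for reuse.
  bool Reset(size_t capacity) {
    slots_.resize(capacity);
    index_.store(0, std::memory_order_relaxed);
    return true;
  }

  // Thread-safe: each caller atomically claims the next slot. Returns nullptr
  // when the pool is exhausted or allocation fails.
  Params* Get() {
    const size_t i = index_.fetch_add(1, std::memory_order_relaxed);
    if (i >= slots_.size()) return nullptr;
    if (slots_[i] == nullptr) {
      slots_[i].reset(new (std::nothrow) Params);
      if (slots_[i] == nullptr) return nullptr;
    }
    return slots_[i].get();
  }

 private:
  std::vector<std::unique_ptr<Params>> slots_;
  std::atomic<size_t> index_{0};
};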
diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h
index 35543c3..ca36907 100644
--- a/src/utils/block_parameters_holder.h
+++ b/src/utils/block_parameters_holder.h
@@ -17,18 +17,18 @@
#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#include <atomic>
#include <memory>
#include "src/utils/array_2d.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
-#include "src/utils/parameter_tree.h"
+#include "src/utils/dynamic_buffer.h"
#include "src/utils/types.h"
namespace libgav1 {
-// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters
-// corresponding to a superblock.
+// Holds the BlockParameters pointers to each 4x4 block in the frame.
class BlockParametersHolder {
public:
BlockParametersHolder() = default;
@@ -37,10 +37,13 @@ class BlockParametersHolder {
BlockParametersHolder(const BlockParametersHolder&) = delete;
BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
- // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
- // otherwise 64x64 superblocks will be used.
- LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
- bool use_128x128_superblock);
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4);
+
+ // Returns a pointer to a BlockParameters object that can be used safely until
+ // the next call to Reset(). Returns nullptr on memory allocation failure. It
+ // also fills the cache matrix for the block starting at |row4x4|, |column4x4|
+ // of size |block_size| with the returned pointer.
+ BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size);
// Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
// is done as a simple look up of the |block_parameters_cache_| matrix.
@@ -59,20 +62,24 @@ class BlockParametersHolder {
int columns4x4() const { return columns4x4_; }
- // Returns the ParameterTree corresponding to superblock starting at (|row|,
- // |column|).
- ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); }
+ private:
+ // Needs access to FillCache for testing Cdef.
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterApplyCdefTest;
- // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of
- // size |block_size| with the pointer |bp|.
void FillCache(int row4x4, int column4x4, BlockSize block_size,
BlockParameters* bp);
- private:
int rows4x4_ = 0;
int columns4x4_ = 0;
- bool use_128x128_superblock_ = false;
- Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+ // Owns the memory of BlockParameters pointers for the entire frame. It can
+ // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be allocated
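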
+ // on demand and re-used across frames.
+ DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_;
+
+ // Points to the next available index of |block_parameters_|.
+ std::atomic<int> index_;
// This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by
// FillCache() and used by Find() to perform look ups using exactly one look
diff --git a/src/utils/common.h b/src/utils/common.h
index ae43c2b..2e599f0 100644
--- a/src/utils/common.h
+++ b/src/utils/common.h
@@ -30,7 +30,6 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstdlib>
#include <cstring>
#include <type_traits>
@@ -131,7 +130,7 @@ inline int CountLeadingZeros(uint64_t n) {
#if defined(HAVE_BITSCANREVERSE64)
const unsigned char bit_set =
_BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
-#else // !defined(HAVE_BITSCANREVERSE64)
+#else // !defined(HAVE_BITSCANREVERSE64)
const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int)
if (n_hi != 0) {
const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
@@ -376,7 +375,7 @@ constexpr bool IsDirectionalMode(PredictionMode mode) {
// behavior and result apply to other CPUs' SIMD instructions.
inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
const unsigned int order_hint_shift_bits) {
- const int diff = a - b;
+ const int diff = static_cast<int>(a) - static_cast<int>(b);
assert(order_hint_shift_bits <= 31);
if (order_hint_shift_bits == 0) {
assert(a == 0);
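
The cast in GetRelativeDistance() matters because subtracting two unsigned order hints wraps around instead of going negative; converting each operand to int first keeps the difference signed. A tiny stand-alone illustration of the distinction (the values are arbitrary):

#include <cstdio>

int main() {
  const unsigned int a = 1, b = 2;
  // Unsigned subtraction wraps modulo 2^32: this is 4294967295, not -1.
  const unsigned int wrapped = a - b;
  // Casting each operand first performs the subtraction on signed ints,
  // giving the intended relative distance of -1.
  const int diff = static_cast<int>(a) - static_cast<int>(b);
  std::printf("wrapped=%u diff=%d\n", wrapped, diff);
  return 0;
}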
diff --git a/src/utils/constants.h b/src/utils/constants.h
index 34cf56d..a2076c5 100644
--- a/src/utils/constants.h
+++ b/src/utils/constants.h
@@ -629,6 +629,52 @@ inline const char* ToString(const LoopRestorationType type) {
abort();
}
+inline const char* ToString(const TransformSize size) {
+ switch (size) {
+ case kTransformSize4x4:
+ return "kTransformSize4x4";
+ case kTransformSize4x8:
+ return "kTransformSize4x8";
+ case kTransformSize4x16:
+ return "kTransformSize4x16";
+ case kTransformSize8x4:
+ return "kTransformSize8x4";
+ case kTransformSize8x8:
+ return "kTransformSize8x8";
+ case kTransformSize8x16:
+ return "kTransformSize8x16";
+ case kTransformSize8x32:
+ return "kTransformSize8x32";
+ case kTransformSize16x4:
+ return "kTransformSize16x4";
+ case kTransformSize16x8:
+ return "kTransformSize16x8";
+ case kTransformSize16x16:
+ return "kTransformSize16x16";
+ case kTransformSize16x32:
+ return "kTransformSize16x32";
+ case kTransformSize16x64:
+ return "kTransformSize16x64";
+ case kTransformSize32x8:
+ return "kTransformSize32x8";
+ case kTransformSize32x16:
+ return "kTransformSize32x16";
+ case kTransformSize32x32:
+ return "kTransformSize32x32";
+ case kTransformSize32x64:
+ return "kTransformSize32x64";
+ case kTransformSize64x16:
+ return "kTransformSize64x16";
+ case kTransformSize64x32:
+ return "kTransformSize64x32";
+ case kTransformSize64x64:
+ return "kTransformSize64x64";
+ case kNumTransformSizes:
+ return "kNumTransformSizes";
+ }
+ abort();
+}
+
inline const char* ToString(const TransformType type) {
switch (type) {
case kTransformTypeDctDct:
diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc
index a6b7057..b3c51da 100644
--- a/src/utils/cpu.cc
+++ b/src/utils/cpu.cc
@@ -39,7 +39,7 @@ uint64_t Xgetbv() {
__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
return (static_cast<uint64_t>(edx) << 32) | eax;
}
-#else // _MSC_VER
+#else // _MSC_VER
void CpuId(int leaf, uint32_t info[4]) {
__cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
}
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
index 630b251..aefc2df 100644
--- a/src/utils/cpu.h
+++ b/src/utils/cpu.h
@@ -38,7 +38,7 @@ namespace libgav1 {
#if !defined(LIBGAV1_ENABLE_AVX2)
#define LIBGAV1_ENABLE_AVX2 1
#endif // !defined(LIBGAV1_ENABLE_AVX2)
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_ENABLE_SSE4_1
// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
#undef LIBGAV1_ENABLE_AVX2
#define LIBGAV1_ENABLE_AVX2 0
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
index b51345a..40ece26 100644
--- a/src/utils/dynamic_buffer.h
+++ b/src/utils/dynamic_buffer.h
@@ -46,6 +46,8 @@ class DynamicBuffer {
return true;
}
+ size_t size() const { return size_; }
+
private:
std::unique_ptr<T[]> buffer_;
size_t size_ = 0;
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
index 8b6ec4b..587ca5d 100644
--- a/src/utils/libgav1_utils.cmake
+++ b/src/utils/libgav1_utils.cmake
@@ -39,8 +39,6 @@ list(APPEND libgav1_utils_sources
"${libgav1_source}/utils/logging.cc"
"${libgav1_source}/utils/logging.h"
"${libgav1_source}/utils/memory.h"
- "${libgav1_source}/utils/parameter_tree.cc"
- "${libgav1_source}/utils/parameter_tree.h"
"${libgav1_source}/utils/queue.h"
"${libgav1_source}/utils/raw_bit_reader.cc"
"${libgav1_source}/utils/raw_bit_reader.h"
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
index 9a43c22..26e3e15 100644
--- a/src/utils/logging.cc
+++ b/src/utils/logging.cc
@@ -56,7 +56,7 @@ void Log(LogSeverity severity, const char* file, int line, const char* format,
va_end(ap);
fprintf(stderr, "\n");
}
-#else // !LIBGAV1_ENABLE_LOGGING
+#else // !LIBGAV1_ENABLE_LOGGING
void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
const char* /*format*/, ...) {}
#endif // LIBGAV1_ENABLE_LOGGING
diff --git a/src/utils/logging.h b/src/utils/logging.h
index 48928db..473aebd 100644
--- a/src/utils/logging.h
+++ b/src/utils/logging.h
@@ -35,13 +35,13 @@
// setting LIBGAV1_ENABLE_LOGGING.
// Severity is given as an all-caps version of enum LogSeverity with the
// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
-#define LIBGAV1_DLOG(severity, ...) \
- do { \
- constexpr const char* libgav1_logging_internal_basename = \
- ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
- ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
- libgav1_logging_internal_basename, __LINE__, \
- __VA_ARGS__); \
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ constexpr const char* libgav1_logging_internal_basename = \
+ libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
+ libgav1_logging_internal_basename, __LINE__, \
+ __VA_ARGS__); \
} while (0)
#else
#define LIBGAV1_DLOG(severity, ...) \
@@ -49,10 +49,10 @@
} while (0)
#endif // LIBGAV1_ENABLE_LOGGING
-#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
#define LIBGAV1_LOGGING_INTERNAL_WARNING \
- ::libgav1::internal::LogSeverity::kWarning
-#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+ libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
namespace libgav1 {
namespace internal {
diff --git a/src/utils/memory.h b/src/utils/memory.h
index 219a83f..a8da53b 100644
--- a/src/utils/memory.h
+++ b/src/utils/memory.h
@@ -71,7 +71,7 @@ inline void* AlignedAlloc(size_t alignment, size_t size) {
// more convenient to use memalign(). Unlike glibc, Android does not consider
// memalign() an obsolete function.
return memalign(alignment, size);
-#else // !defined(__ANDROID__)
+#else // !defined(__ANDROID__)
void* ptr = nullptr;
// posix_memalign requires that the requested alignment be at least
// sizeof(void*). In this case, fall back on malloc which should return
diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc
deleted file mode 100644
index 9426ce6..0000000
--- a/src/utils/parameter_tree.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2019 The libgav1 Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/utils/parameter_tree.h"
-
-#include <cassert>
-#include <memory>
-#include <new>
-
-#include "src/utils/common.h"
-#include "src/utils/constants.h"
-#include "src/utils/logging.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-// static
-std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4,
- BlockSize block_size,
- bool is_leaf) {
- std::unique_ptr<ParameterTree> tree(
- new (std::nothrow) ParameterTree(row4x4, column4x4, block_size));
- if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) {
- tree = nullptr;
- }
- return tree;
-}
-
-bool ParameterTree::SetPartitionType(Partition partition) {
- assert(!partition_type_set_);
- partition_ = partition;
- partition_type_set_ = true;
- const int block_width4x4 = kNum4x4BlocksWide[block_size_];
- const int half_block4x4 = block_width4x4 >> 1;
- const int quarter_block4x4 = half_block4x4 >> 1;
- const BlockSize sub_size = kSubSize[partition][block_size_];
- const BlockSize split_size = kSubSize[kPartitionSplit][block_size_];
- assert(partition == kPartitionNone || sub_size != kBlockInvalid);
- switch (partition) {
- case kPartitionNone:
- parameters_.reset(new (std::nothrow) BlockParameters());
- return parameters_ != nullptr;
- case kPartitionHorizontal:
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr;
- case kPartitionVertical:
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr;
- case kPartitionSplit:
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, sub_size, false);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, false);
- children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, false);
- children_[3] = ParameterTree::Create(
- row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr && children_[3] != nullptr;
- case kPartitionHorizontalWithTopSplit:
- assert(split_size != kBlockInvalid);
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, split_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- split_size, true);
- children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionHorizontalWithBottomSplit:
- assert(split_size != kBlockInvalid);
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- split_size, true);
- children_[2] =
- ParameterTree::Create(row4x4_ + half_block4x4,
- column4x4_ + half_block4x4, split_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionVerticalWithLeftSplit:
- assert(split_size != kBlockInvalid);
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, split_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- split_size, true);
- children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionVerticalWithRightSplit:
- assert(split_size != kBlockInvalid);
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- split_size, true);
- children_[2] =
- ParameterTree::Create(row4x4_ + half_block4x4,
- column4x4_ + half_block4x4, split_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionHorizontal4:
- for (int i = 0; i < 4; ++i) {
- children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
- column4x4_, sub_size, true);
- if (children_[i] == nullptr) return false;
- }
- return true;
- default:
- assert(partition == kPartitionVertical4);
- for (int i = 0; i < 4; ++i) {
- children_[i] = ParameterTree::Create(
- row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
- if (children_[i] == nullptr) return false;
- }
- return true;
- }
-}
-
-} // namespace libgav1
diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h
deleted file mode 100644
index 935f3eb..0000000
--- a/src/utils/parameter_tree.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-
-#include <cassert>
-#include <memory>
-
-#include "src/utils/common.h"
-#include "src/utils/compiler_attributes.h"
-#include "src/utils/constants.h"
-#include "src/utils/memory.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-class ParameterTree : public Allocable {
- public:
- // Creates a parameter tree to store the parameters of a block of size
- // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf|
- // is set to true, the memory will be allocated for the BlockParameters for
- // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to
- // false, |block_size| must be a square block, i.e.,
- // kBlockWidthPixels[block_size] must be equal to
- // kBlockHeightPixels[block_size].
- static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
- BlockSize block_size,
- bool is_leaf = false);
-
- // Move only (not Copyable).
- ParameterTree(ParameterTree&& other) = default;
- ParameterTree& operator=(ParameterTree&& other) = default;
- ParameterTree(const ParameterTree&) = delete;
- ParameterTree& operator=(const ParameterTree&) = delete;
-
- // Set the partition type of the current node to |partition|.
- // if (partition == kPartitionNone) {
- // Memory will be allocated for the BlockParameters for this node.
- // } else if (partition != kPartitionSplit) {
- // The appropriate child nodes will be populated and memory will be
- // allocated for the BlockParameters of the children.
- // } else {
- // The appropriate child nodes will be populated but they are considered to
- // be hanging, i.e., future calls to SetPartitionType() on the child nodes
- // will have to set them or their descendants to a terminal type.
- // }
- // This function must be called only once per node.
- LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
-
- // Basic getters.
- int row4x4() const { return row4x4_; }
- int column4x4() const { return column4x4_; }
- BlockSize block_size() const { return block_size_; }
- Partition partition() const { return partition_; }
- ParameterTree* children(int index) const {
- assert(index < 4);
- return children_[index].get();
- }
- // Returns the BlockParameters object of the current node if one exists.
- // Otherwise returns nullptr. This function will return a valid
- // BlockParameters object only for leaf nodes.
- BlockParameters* parameters() const { return parameters_.get(); }
-
- private:
- ParameterTree(int row4x4, int column4x4, BlockSize block_size)
- : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
-
- Partition partition_ = kPartitionNone;
- std::unique_ptr<BlockParameters> parameters_ = nullptr;
- int row4x4_ = -1;
- int column4x4_ = -1;
- BlockSize block_size_ = kBlockInvalid;
- bool partition_type_set_ = false;
-
- // Child values are defined as follows for various partition types:
- // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
- // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
- // * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left
- // partition; 3 bottom-right partition;
- // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2
- // bottom partition; 3 nullptr;
- // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2
- // bottom-right partition; 3 nullptr;
- // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2
- // right partition; 3 nullptr;
- // * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2
- // bottom-right partition; 3 nullptr;
- // * Horizontal4: 0 top partition; 1 second top partition; 2 third top
- // partition; 3 bottom partition;
- // * Vertical4: 0 left partition; 1 second left partition; 2 third left
- // partition; 3 right partition;
- std::unique_ptr<ParameterTree> children_[4] = {};
-
- friend class ParameterTreeTest;
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h
index 76e7bfa..7d8ce8f 100644
--- a/src/utils/raw_bit_reader.h
+++ b/src/utils/raw_bit_reader.h
@@ -38,7 +38,7 @@ class RawBitReader : public BitReader, public Allocable {
size_t* value); // le(n) in the spec.
bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec.
// Reads a variable length unsigned number and stores it in |*value|. On a
- // successful return, |*value| is in the range of 0 to UINT32_MAX − 1,
+ // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
// inclusive.
bool ReadUvlc(uint32_t* value); // uvlc() in the spec.
bool Finished() const;
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
index 8c8f4fe..a3099e1 100644
--- a/src/utils/threadpool.cc
+++ b/src/utils/threadpool.cc
@@ -37,17 +37,21 @@
#include <chrono> // NOLINT (unapproved c++11 header)
#endif
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
// The glibc wrapper for the gettid() system call was added in glibc 2.30.
// Emulate it for older versions of glibc.
-#if defined(__GLIBC_PREREQ)
-#if !__GLIBC_PREREQ(2, 30)
-
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else // Older than glibc 2.30
#include <sys/syscall.h>
-static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
-
-#endif
-#endif // defined(__GLIBC_PREREQ)
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif // glibc 2.30 or later.
+#endif // defined(__GLIBC__)
namespace libgav1 {
@@ -216,7 +220,7 @@ void ThreadPool::WorkerThread::SetupName() {
// If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
// with error 34 (ERANGE) on Android.
char name[16];
- pid_t id = gettid();
+ pid_t id = GetTid();
int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
static_cast<int64_t>(id));
assert(rv >= 0);
diff --git a/src/utils/types.h b/src/utils/types.h
index 374f06b..eba13b7 100644
--- a/src/utils/types.h
+++ b/src/utils/types.h
@@ -18,6 +18,7 @@
#define LIBGAV1_SRC_UTILS_TYPES_H_
#include <array>
+#include <cstddef>
#include <cstdint>
#include <memory>
@@ -512,6 +513,10 @@ struct ObuFrameHeader {
Delta delta_lf;
// A valid value of reference_frame_index[i] is in the range [0, 7]. -1
// indicates an invalid value.
+ //
+ // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+ // kFrameIntraOnly), reference_frame_index is not used and may be
+ // uninitialized.
int8_t reference_frame_index[kNumInterReferenceFrameTypes];
// The ref_order_hint[ i ] syntax element in the uncompressed header.
// Specifies the expected output order hint for each reference frame.
@@ -521,5 +526,24 @@ struct ObuFrameHeader {
FilmGrainParams film_grain_params;
};
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+ PartitionTreeNode() = default;
+ PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+ : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+ int row4x4 = -1;
+ int column4x4 = -1;
+ BlockSize block_size = kBlockInvalid;
+};
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+ TransformParameters() = default;
+ TransformParameters(TransformType type, int non_zero_coeff_count)
+ : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+ TransformType type;
+ int non_zero_coeff_count;
+};
+
} // namespace libgav1
#endif // LIBGAV1_SRC_UTILS_TYPES_H_
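
With split parse/decode, the parsing pass pushes one TransformParameters entry per transform block and the decoding pass later consumes them in the same order via Front()/Pop(). A minimal sketch of that handoff, using std::queue in place of the project's Queue type and an int in place of TransformType:

#include <cstdio>
#include <queue>

// Reduced stand-in for TransformParameters (src/utils/types.h).
struct TxParams {
  int type;  // Stands in for TransformType.
  int non_zero_coeff_count;
};

int main() {
  std::queue<TxParams> tx_params;

  // Parse pass: record parameters in block order.
  tx_params.push({/*type=*/0, /*non_zero_coeff_count=*/12});
  tx_params.push({/*type=*/3, /*non_zero_coeff_count=*/5});

  // Decode pass: consume entries in the same order (Front() then Pop() in the
  // real Queue API). The real code passes them to ReconstructBlock().
  while (!tx_params.empty()) {
    const TxParams& front = tx_params.front();
    std::printf("type=%d coeffs=%d\n", front.type, front.non_zero_coeff_count);
    tx_params.pop();
  }
  return 0;
}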
diff --git a/tests/block_utils.cc b/tests/block_utils.cc
new file mode 100644
index 0000000..96833a2
--- /dev/null
+++ b/tests/block_utils.cc
@@ -0,0 +1,130 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/block_utils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+template <typename Pixel>
+void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2,
+ const bool print_padding) {
+ const int print_width = print_padding ? std::min(stride1, stride2) : width;
+ const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+
+ for (int y = 0; y < height; ++y) {
+ printf("[%2d] ", y);
+ for (int x = 0; x < print_width; ++x) {
+ if (x >= width) {
+ if (block1[x] == block2[x]) {
+ printf("[%*d] ", field_width, block1[x]);
+ } else {
+ printf("[*%*d] ", field_width - 1, block1[x]);
+ }
+ } else {
+ if (block1[x] == block2[x]) {
+ printf("%*d ", field_width, block1[x]);
+ } else {
+ printf("*%*d ", field_width - 1, block1[x]);
+ }
+ }
+ }
+ printf("\n");
+ block1 += stride1;
+ block2 += stride2;
+ }
+}
+
+} // namespace
+
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+ const bool print_padding /*= false*/) {
+ const int print_width = print_padding ? stride : width;
+ const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+ for (int y = 0; y < height; ++y) {
+ printf("[%2d] ", y);
+ for (int x = 0; x < print_width; ++x) {
+ if (x >= width) {
+ printf("[%*d] ", field_width, block[x]);
+ } else {
+ printf("%*d ", field_width, block[x]);
+ }
+ }
+ printf("\n");
+ block += stride;
+ }
+}
+
+template void PrintBlock(const uint8_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+template void PrintBlock(const uint16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+template void PrintBlock(const int8_t* block, int width, int height, int stride,
+ bool print_padding /*= false*/);
+template void PrintBlock(const int16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2,
+ const bool check_padding, const bool print_diff /*= true*/) {
+ bool ok = true;
+ const int check_width = check_padding ? std::min(stride1, stride2) : width;
+ for (int y = 0; y < height; ++y) {
+ const uint64_t row1 = static_cast<uint64_t>(y) * stride1;
+ const uint64_t row2 = static_cast<uint64_t>(y) * stride2;
+ ok = memcmp(block1 + row1, block2 + row2,
+ sizeof(block1[0]) * check_width) == 0;
+ if (!ok) break;
+ }
+ if (!ok && print_diff) {
+ printf("block1 (width: %d height: %d stride: %d):\n", width, height,
+ stride1);
+ PrintBlockDiff(block1, block2, width, height, stride1, stride2,
+ check_padding);
+ printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height,
+ stride2);
+ PrintBlockDiff(block2, block1, width, height, stride2, stride1,
+ check_padding);
+ }
+ return ok;
+}
+
+template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const int8_t* block1, const int8_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+template bool CompareBlocks(const int16_t* block1, const int16_t* block2,
+ int width, int height, int stride1, int stride2,
+ const bool check_padding,
+ const bool print_diff /*= true*/);
+
+} // namespace test_utils
+} // namespace libgav1
diff --git a/tests/block_utils.h b/tests/block_utils.h
new file mode 100644
index 0000000..4542420
--- /dev/null
+++ b/tests/block_utils.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_
+#define LIBGAV1_TESTS_BLOCK_UTILS_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+namespace test_utils {
+
+//------------------------------------------------------------------------------
+// Prints |block| pixel by pixel with |width| pixels per row if |print_padding|
+// is false, |stride| otherwise. If |print_padding| is true, padding pixels are
+// surrounded by '[]'.
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+ bool print_padding = false);
+
+extern template void PrintBlock(const uint8_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+extern template void PrintBlock(const uint16_t* block, int width, int height,
+ int stride, bool print_padding /*= false*/);
+
+//------------------------------------------------------------------------------
+// Compares |block1| and |block2| pixel by pixel checking |width| pixels per row
+// if |check_padding| is false, min(|stride1|, |stride2|) pixels otherwise.
+// Prints the blocks with differences marked with a '*' if |print_diff| is
+// true (the default).
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+ int height, int stride1, int stride2, bool check_padding,
+ bool print_diff = true);
+
+extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+ int width, int height, int stride1,
+ int stride2, bool check_padding,
+ bool print_diff /*= true*/);
+extern template bool CompareBlocks(const uint16_t* block1,
+ const uint16_t* block2, int width,
+ int height, int stride1, int stride2,
+ bool check_padding,
+ bool print_diff /*= true*/);
+
+} // namespace test_utils
+} // namespace libgav1
+
+#endif // LIBGAV1_TESTS_BLOCK_UTILS_H_
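
The new helpers are intended to be used from the DSP unit tests to diff a reference block against a SIMD implementation's output and print the mismatching pixels. A hedged usage sketch of CompareBlocks() (the block contents and the deliberate mismatch below are arbitrary):

#include <cstdint>

#include "tests/block_utils.h"

int main() {
  constexpr int kWidth = 4;
  constexpr int kHeight = 4;
  constexpr int kStride = 8;  // Wider than the block so padding exists.
  uint8_t reference[kHeight * kStride] = {};
  uint8_t actual[kHeight * kStride] = {};
  actual[2 * kStride + 1] = 7;  // Deliberate mismatch at row 2, column 1.

  // Returns false and prints both blocks with differing pixels marked '*'.
  const bool ok = libgav1::test_utils::CompareBlocks(
      reference, actual, kWidth, kHeight, kStride, kStride,
      /*check_padding=*/false);
  return ok ? 1 : 0;
}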
diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake
new file mode 100644
index 0000000..ac2fb2e
--- /dev/null
+++ b/tests/libgav1_tests.cmake
@@ -0,0 +1,626 @@
+# Copyright 2020 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_LIBGAV1_TESTS_CMAKE_)
+ return()
+endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_
+set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1)
+
+set(libgav1_googletest "${libgav1_root}/third_party/googletest")
+if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}")
+ macro(libgav1_add_tests_targets)
+
+ endmacro()
+
+ if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}")
+ message(
+ "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n"
+ "To enable tests download the GoogleTest repository to"
+ " third_party/googletest:\n\n git \\\n -C ${libgav1_root} \\\n"
+ " clone \\\n"
+ " https://github.com/google/googletest.git third_party/googletest\n")
+ set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+ endif()
+ return()
+endif()
+
+# Check GoogleTest compiler requirements.
+if((CMAKE_CXX_COMPILER_ID
+ MATCHES
+ "Clang|GNU"
+ AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5")
+ OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19"))
+ macro(libgav1_add_tests_targets)
+
+ endmacro()
+
+ message(
+ WARNING
+ "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version"
+ " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for"
+ " GoogleTest; disabling unit tests. See"
+ " https://github.com/google/googletest#compilers for more detail.")
+ set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE)
+ return()
+endif()
+
+list(APPEND libgav1_tests_block_utils_sources
+ "${libgav1_root}/tests/block_utils.h"
+ "${libgav1_root}/tests/block_utils.cc")
+
+list(APPEND libgav1_tests_utils_sources
+ "${libgav1_root}/tests/third_party/libvpx/acm_random.h"
+ "${libgav1_root}/tests/third_party/libvpx/md5_helper.h"
+ "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc"
+ "${libgav1_root}/tests/third_party/libvpx/md5_utils.h"
+ "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc")
+
+list(APPEND libgav1_tests_utils_test_sources
+ "${libgav1_root}/tests/utils_test.cc")
+
+list(APPEND libgav1_average_blend_test_sources
+ "${libgav1_source}/dsp/average_blend_test.cc")
+list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc")
+list(APPEND libgav1_convolve_test_sources
+ "${libgav1_source}/dsp/convolve_test.cc")
+list(APPEND libgav1_distance_weighted_blend_test_sources
+ "${libgav1_source}/dsp/distance_weighted_blend_test.cc")
+list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc")
+list(APPEND libgav1_intra_edge_test_sources
+ "${libgav1_source}/dsp/intra_edge_test.cc")
+list(APPEND libgav1_intrapred_cfl_test_sources
+ "${libgav1_source}/dsp/intrapred_cfl_test.cc")
+list(APPEND libgav1_intrapred_directional_test_sources
+ "${libgav1_source}/dsp/intrapred_directional_test.cc")
+list(APPEND libgav1_intrapred_filter_test_sources
+ "${libgav1_source}/dsp/intrapred_filter_test.cc")
+list(APPEND libgav1_intrapred_test_sources
+ "${libgav1_source}/dsp/intrapred_test.cc")
+list(APPEND libgav1_inverse_transform_test_sources
+ "${libgav1_source}/dsp/inverse_transform_test.cc")
+list(APPEND libgav1_loop_filter_test_sources
+ "${libgav1_source}/dsp/loop_filter_test.cc")
+list(APPEND libgav1_loop_restoration_test_sources
+ "${libgav1_source}/dsp/loop_restoration_test.cc")
+list(APPEND libgav1_mask_blend_test_sources
+ "${libgav1_source}/dsp/mask_blend_test.cc")
+list(APPEND libgav1_motion_field_projection_test_sources
+ "${libgav1_source}/dsp/motion_field_projection_test.cc")
+list(APPEND libgav1_motion_vector_search_test_sources
+ "${libgav1_source}/dsp/motion_vector_search_test.cc")
+list(APPEND libgav1_super_res_test_sources
+ "${libgav1_source}/dsp/super_res_test.cc")
+list(APPEND libgav1_weight_mask_test_sources
+ "${libgav1_source}/dsp/weight_mask_test.cc")
+list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc")
+list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc")
+
+macro(libgav1_add_tests_targets)
+ if(NOT LIBGAV1_ENABLE_TESTS)
+ message(
+ FATAL_ERROR
+ "This version of libgav1_add_tests_targets() should only be used with"
+ " LIBGAV1_ENABLE_TESTS set to true.")
+ endif()
+ libgav1_add_library(TEST
+ NAME
+ libgav1_gtest
+ TYPE
+ STATIC
+ SOURCES
+ "${libgav1_googletest}/googletest/src/gtest-all.cc"
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_gtest_include_paths}
+ ${libgav1_include_paths})
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_gtest_main
+ TYPE
+ STATIC
+ SOURCES
+ "${libgav1_googletest}/googletest/src/gtest_main.cc"
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_gtest_include_paths}
+ ${libgav1_include_paths})
+
+ if(ANDROID OR IOS)
+ if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ set(use_absl_threading TRUE)
+ endif()
+ elseif(NOT
+ (DEFINED
+ LIBGAV1_THREADPOOL_USE_STD_MUTEX
+ AND LIBGAV1_THREADPOOL_USE_STD_MUTEX))
+ set(use_absl_threading TRUE)
+ endif()
+
+ if(use_absl_threading)
+ list(APPEND libgav1_common_test_absl_deps absl::synchronization)
+ endif()
+
+ libgav1_add_executable(TEST
+ NAME
+ tests_utils_test
+ SOURCES
+ ${libgav1_tests_utils_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_tests_block_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_tests_block_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths})
+
+ libgav1_add_library(TEST
+ NAME
+ libgav1_tests_utils
+ TYPE
+ OBJECT
+ SOURCES
+ ${libgav1_tests_utils_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths})
+
+ libgav1_add_executable(TEST
+ NAME
+ average_blend_test
+ SOURCES
+ ${libgav1_average_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ cdef_test
+ SOURCES
+ ${libgav1_cdef_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ convolve_test
+ SOURCES
+ ${libgav1_convolve_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ distance_weighted_blend_test
+ SOURCES
+ ${libgav1_distance_weighted_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ dsp_test
+ SOURCES
+ ${libgav1_dsp_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_cfl_test
+ SOURCES
+ ${libgav1_intrapred_cfl_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_directional_test
+ SOURCES
+ ${libgav1_intrapred_directional_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_filter_test
+ SOURCES
+ ${libgav1_intrapred_filter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intrapred_test
+ SOURCES
+ ${libgav1_intrapred_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ intra_edge_test
+ SOURCES
+ ${libgav1_intra_edge_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_tests_utils
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ inverse_transform_test
+ SOURCES
+ ${libgav1_inverse_transform_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_dsp
+ libgav1_utils
+ LIB_DEPS
+ absl::strings
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ loop_filter_test
+ SOURCES
+ ${libgav1_loop_filter_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ loop_restoration_test
+ SOURCES
+ ${libgav1_loop_restoration_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ mask_blend_test
+ SOURCES
+ ${libgav1_mask_blend_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ motion_field_projection_test
+ SOURCES
+ ${libgav1_motion_field_projection_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ motion_vector_search_test
+ SOURCES
+ ${libgav1_motion_vector_search_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ obmc_test
+ SOURCES
+ ${libgav1_obmc_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ super_res_test
+ SOURCES
+ ${libgav1_super_res_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ warp_test
+ SOURCES
+ ${libgav1_warp_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_block_utils
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+
+ libgav1_add_executable(TEST
+ NAME
+ weight_mask_test
+ SOURCES
+ ${libgav1_weight_mask_test_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_test_include_paths}
+ OBJLIB_DEPS
+ libgav1_decoder
+ libgav1_dsp
+ libgav1_tests_utils
+ libgav1_utils
+ LIB_DEPS
+ absl::str_format_internal
+ absl::time
+ ${libgav1_common_test_absl_deps}
+ libgav1_gtest
+ libgav1_gtest_main)
+endmacro()
diff --git a/tests/third_party/libvpx/LICENSE b/tests/third_party/libvpx/LICENSE
new file mode 100644
index 0000000..83ef339
--- /dev/null
+++ b/tests/third_party/libvpx/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2010, The WebM Project authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google, nor the WebM Project, nor the names
+ of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/tests/third_party/libvpx/acm_random.h b/tests/third_party/libvpx/acm_random.h
new file mode 100644
index 0000000..e8cfc9c
--- /dev/null
+++ b/tests/third_party/libvpx/acm_random.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
+
+#include <cassert>
+#include <cstdint>
+#include <limits>
+
+#include "gtest/gtest.h"
+
+namespace libvpx_test {
+
+class ACMRandom {
+ public:
+ ACMRandom() : random_(DeterministicSeed()) {}
+
+ explicit ACMRandom(int seed) : random_(seed) {}
+
+ void Reset(int seed) { random_.Reseed(seed); }
+ uint16_t Rand16(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ return (value >> 15) & 0xffff;
+ }
+
+ int32_t Rand20Signed(void) {
+ // Use 20 bits: values between 524287 and -524288.
+ const uint32_t value = random_.Generate(1048576);
+ return static_cast<int32_t>(value) - 524288;
+ }
+
+ int16_t Rand16Signed(void) {
+ // Use 16 bits: values between 32767 and -32768.
+ return static_cast<int16_t>(random_.Generate(65536));
+ }
+
+ int16_t Rand13Signed(void) {
+ // Use 13 bits: values between 4095 and -4096.
+ const uint32_t value = random_.Generate(8192);
+ return static_cast<int16_t>(value) - 4096;
+ }
+
+ int16_t Rand9Signed(void) {
+ // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+ const uint32_t value = random_.Generate(512);
+ return static_cast<int16_t>(value) - 256;
+ }
+
+ uint8_t Rand8(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 23) & 0xff;
+ }
+
+ uint8_t Rand8Extremes(void) {
+ // Returns a random value near 0 or near 255, to better exercise
+ // saturation behavior.
+ const uint8_t r = Rand8();
+ return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+ }
+
+ uint32_t RandRange(const uint32_t range) {
+    // testing::internal::Random::Generate provides values in the range
+    // [0, testing::internal::Random::kMaxRange).
+ assert(range <= testing::internal::Random::kMaxRange);
+ return random_.Generate(range);
+ }
+
+ int PseudoUniform(int range) { return random_.Generate(range); }
+
+ int operator()(int n) { return PseudoUniform(n); }
+
+ static constexpr int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+ testing::internal::Random random_;
+};
+
+} // namespace libvpx_test
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_
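As a rough illustration (not part of the patch itself), a dsp test would typically seed ACMRandom with the fixed DeterministicSeed() so failures reproduce, then draw pixel values with Rand8(); the FillRandom8 helper name below is hypothetical:

#include <cstdint>

#include "tests/third_party/libvpx/acm_random.h"

// Hypothetical helper: fill a pixel buffer with reproducible pseudo-random
// values so a failing test can be re-run with identical input.
void FillRandom8(uint8_t* dst, int count) {
  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
  for (int i = 0; i < count; ++i) dst[i] = rnd.Rand8();
}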
diff --git a/tests/third_party/libvpx/md5_helper.h b/tests/third_party/libvpx/md5_helper.h
new file mode 100644
index 0000000..c97b590
--- /dev/null
+++ b/tests/third_party/libvpx/md5_helper.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+namespace libvpx_test {
+class MD5 {
+ public:
+ MD5() { MD5Init(&md5_); }
+
+ void Add(const uint8_t *data, size_t size) {
+ MD5Update(&md5_, data, static_cast<uint32_t>(size));
+ }
+
+ const char *Get(void) {
+ static const char hex[16] = {
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ };
+ uint8_t tmp[16];
+ MD5Context ctx_tmp = md5_;
+
+ MD5Final(tmp, &ctx_tmp);
+ for (int i = 0; i < 16; i++) {
+ res_[i * 2 + 0] = hex[tmp[i] >> 4];
+ res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+ }
+ res_[32] = 0;
+
+ return res_;
+ }
+
+ protected:
+ char res_[33];
+ MD5Context md5_;
+};
+
+} // namespace libvpx_test
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_
diff --git a/tests/third_party/libvpx/md5_utils.cc b/tests/third_party/libvpx/md5_utils.cc
new file mode 100644
index 0000000..4638e54
--- /dev/null
+++ b/tests/third_party/libvpx/md5_utils.cc
@@ -0,0 +1,249 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include "tests/third_party/libvpx/md5_utils.h"
+
+#include <cstring>
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+ md5byte *p;
+
+ /* Only swap bytes for big endian machines */
+ int i = 1;
+
+ if (*(char *)&i == 1) return;
+
+ p = (md5byte *)buf;
+
+ do {
+ *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+ ((unsigned)p[1] << 8 | p[0]);
+ p += 4;
+ } while (--words);
+}
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+ ctx->buf[0] = 0x67452301;
+ ctx->buf[1] = 0xefcdab89;
+ ctx->buf[2] = 0x98badcfe;
+ ctx->buf[3] = 0x10325476;
+
+ ctx->bytes[0] = 0;
+ ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+ UWORD32 t;
+
+ /* Update byte count */
+
+ t = ctx->bytes[0];
+
+ if ((ctx->bytes[0] = t + len) < t)
+ ctx->bytes[1]++; /* Carry from low to high */
+
+ t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+ if (t > len) {
+ memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+ return;
+ }
+
+ /* First chunk is an odd size */
+ memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += t;
+ len -= t;
+
+ /* Process data in 64-byte chunks */
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += 64;
+ len -= 64;
+ }
+
+ /* Handle any remaining bytes of data. */
+ memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+ int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+ md5byte *p = (md5byte *)ctx->in + count;
+
+ /* Set the first char of padding to 0x80. There is always room. */
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 56 bytes (-8..55) */
+ count = 56 - 1 - count;
+
+ if (count < 0) { /* Padding forces an extra block */
+ memset(p, 0, count + 8);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ p = (md5byte *)ctx->in;
+ count = 56;
+ }
+
+ memset(p, 0, count);
+ byteSwap(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ctx->in[14] = ctx->bytes[0] << 3;
+ ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+ MD5Transform(ctx->buf, ctx->in);
+
+ byteSwap(ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
+ UWORD32 const in[16]) {
+ UWORD32 a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
diff --git a/tests/third_party/libvpx/md5_utils.h b/tests/third_party/libvpx/md5_utils.h
new file mode 100644
index 0000000..13be035
--- /dev/null
+++ b/tests/third_party/libvpx/md5_utils.h
@@ -0,0 +1,41 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+ UWORD32 buf[4];
+ UWORD32 bytes[2];
+ UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
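The MD5Init/MD5Update/MD5Final sequence described in the header comment amounts to the following minimal sketch (illustrative only, not part of the patch; PrintMd5 is a hypothetical name):

#include <cstdio>

#include "tests/third_party/libvpx/md5_utils.h"

// Hypothetical sketch: hash a buffer and print the 128-bit digest as hex.
void PrintMd5(const unsigned char* data, unsigned len) {
  MD5Context ctx;
  unsigned char digest[16];
  MD5Init(&ctx);
  MD5Update(&ctx, data, len);  // May be called repeatedly for streamed input.
  MD5Final(digest, &ctx);
  for (int i = 0; i < 16; ++i) printf("%02x", digest[i]);
  printf("\n");
}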
diff --git a/tests/utils.cc b/tests/utils.cc
new file mode 100644
index 0000000..b73cf01
--- /dev/null
+++ b/tests/utils.cc
@@ -0,0 +1,120 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+
+namespace libgav1 {
+namespace test_utils {
+
+void ResetDspTable(const int bitdepth) {
+ dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth);
+ ASSERT_NE(dsp, nullptr);
+ memset(dsp, 0, sizeof(dsp::Dsp));
+}
+
+std::string GetMd5Sum(const void* bytes, size_t size) {
+ libvpx_test::MD5 md5;
+ md5.Add(static_cast<const uint8_t*>(bytes), size);
+ return md5.Get();
+}
+
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) {
+ libvpx_test::MD5 md5;
+ const Pixel* row = block;
+ for (int i = 0; i < height; ++i) {
+ md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel));
+ row += stride;
+ }
+ return md5.Get();
+}
+
+template std::string GetMd5Sum(const int8_t* block, int width, int height,
+ int stride);
+template std::string GetMd5Sum(const int16_t* block, int width, int height,
+ int stride);
+
+std::string GetMd5Sum(const DecoderBuffer& buffer) {
+ libvpx_test::MD5 md5;
+ const size_t pixel_size =
+ (buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) {
+ const int height = buffer.displayed_height[plane];
+ const size_t width = buffer.displayed_width[plane] * pixel_size;
+ const int stride = buffer.stride[plane];
+ const uint8_t* plane_buffer = buffer.plane[plane];
+ for (int row = 0; row < height; ++row) {
+ md5.Add(plane_buffer, width);
+ plane_buffer += stride;
+ }
+ }
+ return md5.Get();
+}
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const void* data, size_t size,
+ absl::Duration elapsed_time) {
+ const std::string digest = test_utils::GetMd5Sum(data, size);
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ digest.c_str());
+ EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const Pixel* block, int width,
+ int height, int stride, absl::Duration elapsed_time) {
+ const std::string digest =
+ test_utils::GetMd5Sum(block, width, height, stride);
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ digest.c_str());
+ EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const int8_t* block,
+ int width, int height, int stride,
+ absl::Duration elapsed_time);
+template void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const int16_t* block,
+ int width, int height, int stride,
+ absl::Duration elapsed_time);
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const char actual_digest[],
+ absl::Duration elapsed_time) {
+ printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+ static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+ actual_digest);
+ EXPECT_STREQ(expected_digest, actual_digest);
+}
+
+} // namespace test_utils
+} // namespace libgav1
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..b3062da
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_UTILS_H_
+#define LIBGAV1_TESTS_UTILS_H_
+
+#include <cstddef>
+#include <new>
+#include <string>
+
+#include "absl/base/config.h"
+#include "absl/time/time.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+
+enum { kAlternateDeterministicSeed = 0x9571 };
+static_assert(kAlternateDeterministicSeed !=
+ libvpx_test::ACMRandom::DeterministicSeed(),
+ "");
+
+// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions
+// of new to support googletest allocations.
+struct MaxAlignedAllocable {
+ // Class-specific allocation functions.
+ static void* operator new(size_t size) {
+ void* const p =
+ libgav1::MaxAlignedAllocable::operator new(size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+ if (p == nullptr) throw std::bad_alloc();
+#endif
+ return p;
+ }
+ static void* operator new[](size_t size) {
+ void* const p =
+ libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+ if (p == nullptr) throw std::bad_alloc();
+#endif
+ return p;
+ }
+
+ // Class-specific non-throwing allocation functions
+ static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ return libgav1::MaxAlignedAllocable::operator new(size, tag);
+ }
+ static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ return libgav1::MaxAlignedAllocable::operator new[](size, tag);
+ }
+
+ // Class-specific deallocation functions.
+ static void operator delete(void* ptr) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete(ptr);
+ }
+ static void operator delete[](void* ptr) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete[](ptr);
+ }
+
+ // Only called if new (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete(ptr, tag);
+ }
+ // Only called if new[] (std::nothrow) is used and the constructor throws an
+ // exception.
+ static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+ libgav1::MaxAlignedAllocable::operator delete[](ptr, tag);
+ }
+};
+
+// Clears dsp table entries for |bitdepth|. This function is not thread safe.
+void ResetDspTable(int bitdepth);
+
+//------------------------------------------------------------------------------
+// Gets a human-readable, hexadecimal-encoded MD5 sum of the given data,
+// block, or frame buffer.
+
+std::string GetMd5Sum(const void* bytes, size_t size);
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride);
+std::string GetMd5Sum(const DecoderBuffer& buffer);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |size| bytes of |data| with |expected_digest|.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const void* data, size_t size,
+ absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |block| with |expected_digest|. The width, height,
+// and stride of |block| are |width|, |height|, and |stride|, respectively.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const Pixel* block, int width,
+ int height, int stride, absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares |actual_digest| with |expected_digest|. Prints a log message with
+// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and
+// |function_name| are merely tags used for logging and can be any meaningful
+// string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+ const char expected_digest[], const char actual_digest[],
+ absl::Duration elapsed_time);
+
+} // namespace test_utils
+} // namespace libgav1
+
+#endif // LIBGAV1_TESTS_UTILS_H_
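A loose sketch of how a dsp test might combine the helpers declared above (illustrative only, not part of the patch; VerifyBlockDigest is a hypothetical name and the digest literal is a placeholder, not a real expected value):

#include <cstdint>

#include "absl/time/time.h"
#include "tests/utils.h"

// Hypothetical sketch: hash a filtered block and compare it to a known-good
// digest, logging the tag, function name, elapsed time, and actual digest.
void VerifyBlockDigest(const int16_t* block, int width, int height, int stride,
                       absl::Duration elapsed) {
  // "<expected-md5-digest>" stands in for a real 32-character MD5 string.
  libgav1::test_utils::CheckMd5Digest("Sketch", "VerifyBlockDigest",
                                      "<expected-md5-digest>", block, width,
                                      height, stride, elapsed);
}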
diff --git a/tests/utils_test.cc b/tests/utils_test.cc
new file mode 100644
index 0000000..1d5b598
--- /dev/null
+++ b/tests/utils_test.cc
@@ -0,0 +1,190 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+#include "src/utils/memory.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+// Has a trivial default constructor that performs no action.
+struct SmallMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x;
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x = 0;
+};
+
+// Has a trivial default constructor that performs no action.
+struct HugeMaxAligned : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+ alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {};
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+ MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+ uint8_t x;
+};
+#endif
+
+TEST(TestUtilsTest, TestMaxAlignedAllocable) {
+ {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+ EXPECT_NE(small, nullptr);
+ // Note this check doesn't guarantee conformance as a suitably aligned
+ // address may be returned from any allocator.
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new is called.
+ std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned);
+ EXPECT_NE(small, nullptr);
+ // Note this check doesn't guarantee conformance as a suitably aligned
+ // address may be returned from any allocator.
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+ new (std::nothrow) SmallMaxAligned[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+ (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete[] is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] is called.
+ std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+ new SmallMaxAligned[10]);
+ EXPECT_NE(small_array_of_smalls, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+ (kMaxAlignment - 1),
+ 0);
+ // MaxAlignedAllocable::operator delete[] is called.
+ }
+
+ {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+ EXPECT_EQ(huge, nullptr);
+ }
+
+ {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+ new (std::nothrow)
+ SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+ EXPECT_EQ(huge_array_of_smalls, nullptr);
+ }
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+ try {
+ // MaxAlignedAllocable::operator new (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete is called.
+ auto* always = new MaxAlignedThrowingConstructor;
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+ auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] is called.
+ // The constructor throws an exception.
+ // MaxAlignedAllocable::operator delete[] is called.
+ auto* always = new MaxAlignedThrowingConstructor[2];
+ static_cast<void>(always);
+ } catch (...) {
+ }
+
+  // Note these calls are only safe with exceptions enabled: if the throwing
+  // operator new returns, the object is expected to be valid. Without
+  // exceptions, an attempt to invoke the object's constructor on a nullptr
+  // may be made, which is undefined behavior.
+ try {
+ // MaxAlignedAllocable::operator new is called.
+ std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge(
+ new HugeMaxAlignedNontrivialConstructor);
+ ADD_FAILURE() << "huge allocation should fail.";
+ } catch (...) {
+ SUCCEED();
+ }
+
+ try {
+ // MaxAlignedAllocable::operator new[] is called.
+ std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]>
+ huge_array_of_smalls(
+ new SmallMaxAlignedNontrivialConstructor
+ [kMaxAllocableSize /
+ sizeof(SmallMaxAlignedNontrivialConstructor) +
+ 1]);
+ ADD_FAILURE() << "huge_array_of_smalls allocation should fail.";
+ } catch (...) {
+ SUCCEED();
+ }
+#endif // ABSL_HAVE_EXCEPTIONS
+}
+
+} // namespace
+} // namespace test_utils
+} // namespace libgav1