diff options
author | qinxialei <xialeiqin@gmail.com> | 2021-04-22 11:20:18 +0800 |
---|---|---|
committer | qinxialei <xialeiqin@gmail.com> | 2021-04-22 11:20:18 +0800 |
commit | 81ce37eb93e8ce442ecb1855a4e7166628128ac7 (patch) | |
tree | 2af6329f74f88ce090d08c61db5fb4bed8656584 | |
parent | 4dab0c756a3cdd65b43470a4cca835422b32ca6e (diff) | |
parent | 2381d803c76105f44717d75f089ec37f51e5cfe4 (diff) | |
download | libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.tar.gz libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.tar.bz2 libgav1-81ce37eb93e8ce442ecb1855a4e7166628128ac7.zip |
Update upstream source from tag 'upstream/0.16.3'
Update to upstream version '0.16.3'
with Debian dir a433e04a69210eb8fcdd6089240e161eb33f0590
179 files changed, 36565 insertions, 7327 deletions
diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b934084 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* whitespace=tab-in-indent,space-before-tab,trailing-space diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d00ae6..5e9e17a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,31 +36,17 @@ endif() set(libgav1_examples "${libgav1_root}/examples") set(libgav1_source "${libgav1_root}/src") -include(FindThreads) - -include("${libgav1_examples}/libgav1_examples.cmake") -include("${libgav1_root}/cmake/libgav1_build_definitions.cmake") -include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake") -include("${libgav1_root}/cmake/libgav1_flags.cmake") -include("${libgav1_root}/cmake/libgav1_helpers.cmake") -include("${libgav1_root}/cmake/libgav1_install.cmake") -include("${libgav1_root}/cmake/libgav1_intrinsics.cmake") include("${libgav1_root}/cmake/libgav1_options.cmake") -include("${libgav1_root}/cmake/libgav1_sanitizer.cmake") -include("${libgav1_root}/cmake/libgav1_targets.cmake") -include("${libgav1_root}/cmake/libgav1_variables.cmake") -include("${libgav1_source}/dsp/libgav1_dsp.cmake") -include("${libgav1_source}/libgav1_decoder.cmake") -include("${libgav1_source}/utils/libgav1_utils.cmake") libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING "Enables optimized code." VALUE ON) -libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING - "Enables avx2 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations." + VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations." VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING "Enables sse4.1 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON) libgav1_option( NAME LIBGAV1_VERBOSE HELPSTRING "Enables verbose build system output. Higher numbers are more verbose." 
VALUE @@ -70,6 +56,23 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() +include(FindThreads) + +include("${libgav1_examples}/libgav1_examples.cmake") +include("${libgav1_root}/cmake/libgav1_build_definitions.cmake") +include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake") +include("${libgav1_root}/cmake/libgav1_flags.cmake") +include("${libgav1_root}/cmake/libgav1_helpers.cmake") +include("${libgav1_root}/cmake/libgav1_install.cmake") +include("${libgav1_root}/cmake/libgav1_intrinsics.cmake") +include("${libgav1_root}/cmake/libgav1_sanitizer.cmake") +include("${libgav1_root}/cmake/libgav1_targets.cmake") +include("${libgav1_root}/cmake/libgav1_variables.cmake") +include("${libgav1_root}/tests/libgav1_tests.cmake") +include("${libgav1_source}/dsp/libgav1_dsp.cmake") +include("${libgav1_source}/libgav1_decoder.cmake") +include("${libgav1_source}/utils/libgav1_utils.cmake") + libgav1_optimization_detect() libgav1_set_build_definitions() libgav1_set_cxx_flags() @@ -109,13 +112,27 @@ if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "") separate_arguments(LIBGAV1_EXE_LINKER_FLAGS) endif() -add_subdirectory("${libgav1_root}/third_party/abseil-cpp" - "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) +# Set test-only flags based on LIBGAV1_CXX_FLAGS. +libgav1_set_test_flags() + +set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp") +if(NOT EXISTS "${libgav1_abseil}") + message( + FATAL_ERROR + "Abseil not found. This dependency is required by the" + " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" + " not defined. 
To continue, download the Abseil repository to" + " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") +endif() +add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) libgav1_reset_target_lists() libgav1_add_dsp_targets() libgav1_add_decoder_targets() libgav1_add_examples_targets() +libgav1_add_tests_targets() libgav1_add_utils_targets() libgav1_setup_install_target() @@ -20,7 +20,18 @@ information on the AV1 video format can be found at From within the libgav1 directory: ```shell - $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp + $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp + ``` + + Note: Abseil is required by the examples and tests. libgav1 will depend on + it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below). + +4. (Optional) [GoogleTest](https://github.com/google/googletest) + + From within the libgav1 directory: + + ```shell + $ git clone https://github.com/google/googletest.git third_party/googletest ``` ### Compile @@ -58,10 +69,11 @@ Configuration options: * `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil dependency from the core library. Automatically defined in - `src/utils/threadpool.h` if unset. + `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0 + otherwise. * `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is - allowed to create. Has to be an integer > 0. Otherwise this is ignored. - The default value is 128. + allowed to create. Has to be an integer > 0. Otherwise this is ignored. The + default value is 128. * `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that is used to determine when to use frame parallel decoding. Frame parallel decoding will be used if |threads| > |tile_count| * this multiplier. 
Has to diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake index b170e7e..fc83490 100644 --- a/cmake/libgav1_build_definitions.cmake +++ b/cmake/libgav1_build_definitions.cmake @@ -21,7 +21,24 @@ macro(libgav1_set_build_definitions) string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) libgav1_load_version_info() - set(LIBGAV1_SOVERSION 0) + + # Library version info. See the libtool docs for updating the values: + # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info + # + # c=<current>, r=<revision>, a=<age> + # + # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is + # passed to libtool. + # + # We set LIBGAV1_SOVERSION = [c-a].a.r + set(LT_CURRENT 0) + set(LT_REVISION 0) + set(LT_AGE 0) + math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}") + set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}") + unset(LT_CURRENT) + unset(LT_REVISION) + unset(LT_AGE) list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src" "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp") @@ -89,9 +106,7 @@ macro(libgav1_set_build_definitions) endif() if(build_type_lowercase MATCHES "rel") - # TODO(tomfinegan): this value is only a concern for the core library and - # can be made smaller if the test targets are avoided. - list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608") + list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608") endif() list(APPEND libgav1_msvc_cxx_flags diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake index 2d8d9a6..a5408e2 100644 --- a/cmake/libgav1_flags.cmake +++ b/cmake/libgav1_flags.cmake @@ -205,7 +205,7 @@ macro(libgav1_test_exe_linker_flag) # Restore cached global exe linker flags. 
if(cached_CMAKE_EXE_LINKER_FLAGS) - set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS) + set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS}) else() unset(CMAKE_EXE_LINKER_FLAGS) endif() @@ -249,3 +249,15 @@ macro(libgav1_set_cxx_flags) libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists}) endmacro() + +# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS. +# +# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore, +# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS +# are complete. +macro(libgav1_set_test_flags) + if(LIBGAV1_ENABLE_TESTS) + set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS}) + list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than") + endif() +endmacro() diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake index 76d8d67..ac16257 100644 --- a/cmake/libgav1_helpers.cmake +++ b/cmake/libgav1_helpers.cmake @@ -20,7 +20,13 @@ set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1) # Kills build generation using message(FATAL_ERROR) and outputs all data passed # to the console via use of $ARGN. macro(libgav1_die) - message(FATAL_ERROR ${ARGN}) + # macro parameters are not variables so a temporary is needed to work with + # list(). + set(msg ${ARGN}) + # message(${ARGN}) will merge all list elements with no separator while + # "${ARGN}" will output the list as a ';' delimited string. + list(JOIN msg " " msg) + message(FATAL_ERROR "${msg}") endmacro() # Converts semi-colon delimited list variable(s) to string. Output is written to @@ -94,10 +100,10 @@ macro(libgav1_create_dummy_source_file) "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc") set(dummy_source_code "// Generated file. DO NOT EDIT!\n" - "// C++ source file created for target ${cdsf_TARGET}. 
\n" - "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n" + "// C++ source file created for target ${cdsf_TARGET}.\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n" "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n") - file(WRITE "${dummy_source_file}" "${dummy_source_code}") + file(WRITE "${dummy_source_file}" ${dummy_source_code}) target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file}) diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake index 4bb2263..2f9ee07 100644 --- a/cmake/libgav1_sanitizer.cmake +++ b/cmake/libgav1_sanitizer.cmake @@ -39,7 +39,9 @@ macro(libgav1_configure_sanitizer) list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer" "-fno-optimize-sibling-calls") - libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) + # Check the linker flags first as they may be required in the compile check + # to avoid undefined symbols related to the sanitizer. libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS) + libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) endif() endmacro() diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake index 78b4865..997f8bd 100644 --- a/cmake/libgav1_targets.cmake +++ b/cmake/libgav1_targets.cmake @@ -29,7 +29,7 @@ endmacro() # Creates an executable target. 
The target name is passed as a parameter to the # NAME argument, and the sources passed as a parameter to the SOURCES argument: -# libgav1_add_test(NAME <name> SOURCES <sources> [optional args]) +# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args]) # # Optional args: # cmake-format: off @@ -115,15 +115,35 @@ macro(libgav1_add_executable) target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES}) endif() - if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS) + unset(exe_LIBGAV1_COMPILE_FLAGS) + if(exe_TEST) + list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$") + list(LENGTH exe_SOURCES exe_SOURCES_length) + if(exe_SOURCES_length EQUAL 0) + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS}) + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS}) + endif() + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS}) + endif() + + if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS) target_compile_options(${exe_NAME} - PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS}) + PRIVATE ${exe_COMPILE_FLAGS} + ${exe_LIBGAV1_COMPILE_FLAGS}) endif() if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS) - set_target_properties(${exe_NAME} - PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS} - ${LIBGAV1_EXE_LINKER_FLAGS}) + list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}") + if(${CMAKE_VERSION} VERSION_LESS "3.13") + # LINK_FLAGS is managed as a string. 
+ libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS) + set_target_properties(${exe_NAME} + PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}") + else() + target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS}) + endif() endif() if(exe_OBJLIB_DEPS) @@ -137,7 +157,7 @@ macro(libgav1_add_executable) endif() if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) - target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") + target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") endif() if(exe_LIB_DEPS) @@ -321,7 +341,9 @@ macro(libgav1_add_library) endif() if(lib_TYPE STREQUAL SHARED AND NOT MSVC) - set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION}) + set_target_properties(${lib_NAME} + PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION + ${LIBGAV1_SOVERSION_MAJOR}) endif() if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc index 4de0ba2..1408e8c 100644 --- a/examples/gav1_decode.cc +++ b/examples/gav1_decode.cc @@ -419,6 +419,9 @@ int main(int argc, char* argv[]) { input_buffers.ReleaseInputBuffer(input_buffer); } input_buffer = nullptr; + // Clear any in progress frames to ensure the output frame limit is + // respected. 
+ decoder.SignalEOS(); } } while (input_buffer != nullptr || (!file_reader->IsEndOfFile() && !limit_reached) || diff --git a/examples/logging.h b/examples/logging.h index c0bcad7..cf5a09f 100644 --- a/examples/logging.h +++ b/examples/logging.h @@ -46,7 +46,7 @@ constexpr const char* Basename(const char* file_name, size_t offset) { #define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \ do { \ constexpr const char* libgav1_examples_basename = \ - ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \ + libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \ __func__, error_string); \ } while (false) diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc index 751671d..e23903c 100644 --- a/src/decoder_impl.cc +++ b/src/decoder_impl.cc @@ -36,7 +36,6 @@ #include "src/utils/common.h" #include "src/utils/constants.h" #include "src/utils/logging.h" -#include "src/utils/parameter_tree.h" #include "src/utils/raw_bit_reader.h" #include "src/utils/segmentation.h" #include "src/utils/threadpool.h" @@ -631,10 +630,6 @@ DecoderImpl::~DecoderImpl() { } StatusCode DecoderImpl::Init() { - if (!GenerateWedgeMask(&wedge_masks_)) { - LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); - return kStatusOutOfMemory; - } if (!output_frame_queue_.Init(kMaxLayers)) { LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed."); return kStatusOutOfMemory; @@ -857,6 +852,10 @@ StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size, LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); return kStatusOutOfMemory; } + if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) { + LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed."); + return kStatusOutOfMemory; + } if (IsNewSequenceHeader(*obu)) { const ObuSequenceHeader& sequence_header = obu->sequence_header(); const Libgav1ImageFormat image_format = @@ -1050,6 +1049,10 @@ StatusCode DecoderImpl::DecodeTemporalUnit(const 
TemporalUnit& temporal_unit, LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); return kStatusOutOfMemory; } + if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) { + LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed."); + return kStatusOutOfMemory; + } if (IsNewSequenceHeader(*obu)) { const ObuSequenceHeader& sequence_header = obu->sequence_header(); const Libgav1ImageFormat image_format = @@ -1278,8 +1281,7 @@ StatusCode DecoderImpl::DecodeTiles( // without having to check for boundary conditions. if (!frame_scratch_buffer->block_parameters_holder.Reset( frame_header.rows4x4 + kMaxBlockHeight4x4, - frame_header.columns4x4 + kMaxBlockWidth4x4, - sequence_header.use_128x128_superblock)) { + frame_header.columns4x4 + kMaxBlockWidth4x4)) { return kStatusOutOfMemory; } const dsp::Dsp* const dsp = @@ -1646,6 +1648,17 @@ bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) { return sequence_header_changed; } +bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) { + if (IsIntraFrame(frame_type) || wedge_masks_initialized_) { + return true; + } + if (!GenerateWedgeMask(&wedge_masks_)) { + return false; + } + wedge_masks_initialized_ = true; + return true; +} + bool DecoderImpl::MaybeInitializeQuantizerMatrix( const ObuFrameHeader& frame_header) { if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) { diff --git a/src/decoder_impl.h b/src/decoder_impl.h index 721b666..b52ecdf 100644 --- a/src/decoder_impl.h +++ b/src/decoder_impl.h @@ -215,6 +215,10 @@ class DecoderImpl : public Allocable { // |quantizer_matrix_initialized_| to true. bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header); + // Allocates and generates the |wedge_masks_| if necessary and sets + // |wedge_masks_initialized_| to true. + bool MaybeInitializeWedgeMasks(FrameType frame_type); + // Elements in this queue cannot be moved with std::move since the // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue. 
Queue<TemporalUnit> temporal_units_; @@ -233,6 +237,7 @@ class DecoderImpl : public Allocable { BufferPool buffer_pool_; WedgeMaskArray wedge_masks_; + bool wedge_masks_initialized_ = false; QuantizerMatrix quantizer_matrix_; bool quantizer_matrix_initialized_ = false; FrameScratchBufferPool frame_scratch_buffer_pool_; diff --git a/src/decoder_state.h b/src/decoder_state.h index 897c99f..ea5c792 100644 --- a/src/decoder_state.h +++ b/src/decoder_state.h @@ -33,7 +33,6 @@ struct DecoderState { for (int ref_index = 0, mask = refresh_frame_flags; mask != 0; ++ref_index, mask >>= 1) { if ((mask & 1) != 0) { - reference_valid[ref_index] = true; reference_frame_id[ref_index] = current_frame_id; reference_frame[ref_index] = current_frame; reference_order_hint[ref_index] = order_hint; @@ -43,7 +42,6 @@ struct DecoderState { // Clears all the reference frames. void ClearReferenceFrames() { - reference_valid = {}; reference_frame_id = {}; reference_order_hint = {}; for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) { @@ -51,12 +49,11 @@ struct DecoderState { } } - // reference_valid and reference_frame_id are used only if - // sequence_header_.frame_id_numbers_present is true. - // The reference_valid array is indexed by a reference picture slot number. - // A value (boolean) in the array signifies whether the corresponding - // reference picture slot is valid for use as a reference picture. - std::array<bool, kNumReferenceFrameTypes> reference_valid = {}; + // reference_frame_id and current_frame_id have meaningful values and are used + // in checks only if sequence_header_.frame_id_numbers_present is true. If + // sequence_header_.frame_id_numbers_present is false, reference_frame_id and + // current_frame_id are assigned the default value 0 and are not used in + // checks. std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {}; // A valid value of current_frame_id is an unsigned integer of at most 16 // bits. 
-1 indicates current_frame_id is not initialized. @@ -81,6 +78,11 @@ struct DecoderState { // * |true| indicates that the reference frame is a backwards reference. // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used. std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {}; + // The RefValid[i] variable in the spec does not need to be stored explicitly. + // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a + // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set + // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the + // spec is 1, then reference_frame[i] contains a frame buffer pointer. std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame; }; diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc index 834e8b4..5b4c094 100644 --- a/src/dsp/arm/average_blend_neon.cc +++ b/src/dsp/arm/average_blend_neon.cc @@ -35,6 +35,11 @@ namespace { constexpr int kInterPostRoundBit = kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; +} // namespace + +namespace low_bitdepth { +namespace { + inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0, const int16_t* prediction_1) { const int16x8_t pred0 = vld1q_s16(prediction_0); @@ -128,13 +133,139 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + const uint16x8_t pred0 = vld1q_u16(prediction_0); + const uint16x8_t pred1 = vld1q_u16(prediction_1); + const uint32x4_t pred_lo = + vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1)); + const uint32x4_t pred_hi = + vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1)); + const int32x4_t offset_lo = + vsubq_s32(vreinterpretq_s32_u32(pred_lo), 
compound_offset); + const int32x4_t offset_hi = + vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset); + const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1); + const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1); + return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth); +} + +inline void AverageBlendLargeRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, const int width, + uint16_t* dest, + const int32x4_t compound_offset, + const uint16x8_t v_bitdepth) { + int x = width; + do { + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1, + compound_offset, v_bitdepth)); + prediction_0 += 8; + prediction_1 += 8; + dest += 8; + + x -= 16; + } while (x != 0); +} + +void AverageBlend_NEON(const void* prediction_0, const void* prediction_1, + const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y = height; + + const ptrdiff_t dst_stride = dest_stride >> 1; + const int32x4_t compound_offset = + vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset)); + const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + if (width == 4) { + do { + const uint16x8_t result = + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth); + pred_0 += 8; + pred_1 += 8; + + vst1_u16(dst, vget_low_u16(result)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(result)); + dst += dst_stride; + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + vst1q_u16(dst, + AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + vst1q_u16(dst, + 
AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth)); + dst += dst_stride; + pred_0 += 8; + pred_1 += 8; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset, + v_bitdepth); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->average_blend = AverageBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 -void AverageBlendInit_NEON() { Init8bpp(); } +void AverageBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc index 4d0e76f..60c72d6 100644 --- a/src/dsp/arm/cdef_neon.cc +++ b/src/dsp/arm/cdef_neon.cc @@ -265,7 +265,7 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00 // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00 // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00 - partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2); partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3); @@ -285,9 +285,8 @@ LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 // 60 61 62 63 64 65 66 
67 00 00 00 00 00 00 00 00 // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 - const uint8x8_t v_zero = vdup_n_u8(0); - partial_lo[6] = vaddl_u8(v_zero, v_src[0]); - for (int i = 1; i < 8; ++i) { + partial_lo[6] = vaddl_u8(v_src[0], v_src[1]); + for (int i = 2; i < 8; ++i) { partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]); } @@ -451,7 +450,7 @@ void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, const uint16x8_t threshold, const int16x8_t damping) { - // If reference > pixel, the difference will be negative, so covert to 0 or + // If reference > pixel, the difference will be negative, so convert to 0 or // -1. const uint16x8_t sign = vcgtq_u16(reference, pixel); const uint16x8_t abs_diff = vabdq_u16(pixel, reference); @@ -686,7 +685,7 @@ void CdefInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h index dcb7567..05e0d05 100644 --- a/src/dsp/arm/common_neon.h +++ b/src/dsp/arm/common_neon.h @@ -28,8 +28,7 @@ #if 0 #include <cstdio> - -#include "absl/strings/str_cat.h" +#include <string> constexpr bool kEnablePrintRegs = true; @@ -86,11 +85,11 @@ inline void PrintVectQ(const DebugRegisterQ r, const char* const name, inline void PrintReg(const int32x4x2_t val, const std::string& name) { DebugRegisterQ r; - vst1q_u32(r.u32, val.val[0]); - const std::string name0 = absl::StrCat(name, ".val[0]").c_str(); + vst1q_s32(r.i32, val.val[0]); + const std::string name0 = name + std::string(".val[0]"); PrintVectQ(r, name0.c_str(), 32); - vst1q_u32(r.u32, val.val[1]); - const std::string name1 = absl::StrCat(name, ".val[1]").c_str(); + vst1q_s32(r.i32, val.val[1]); + const std::string name1 = name + std::string(".val[1]"); PrintVectQ(r, name1.c_str(), 32); } @@ -169,14 +168,14 @@ inline void 
PrintReg(const int8x8_t val, const char* name) { // Print an individual (non-vector) value in decimal format. inline void PrintReg(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %d\n", name, x); + fprintf(stderr, "%s: %d\n", name, x); } } // Print an individual (non-vector) value in hexadecimal format. inline void PrintHex(const int x, const char* name) { if (kEnablePrintRegs) { - printf("%s: %x\n", name, x); + fprintf(stderr, "%s: %x\n", name, x); } } @@ -277,22 +276,32 @@ inline void Store2(uint16_t* const buf, const uint16x4_t val) { ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane)); } +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store4(void* const buf, const uint16x4_t val) { + vst1_u16(static_cast<uint16_t*>(buf), val); +} + +// Simplify code when caller has |buf| cast as uint8_t*. +inline void Store8(void* const buf, const uint16x8_t val) { + vst1q_u16(static_cast<uint16_t*>(buf), val); +} + //------------------------------------------------------------------------------ // Bit manipulation. // vshXX_n_XX() requires an immediate. 
template <int shift> -inline uint8x8_t LeftShift(const uint8x8_t vector) { +inline uint8x8_t LeftShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift)); } template <int shift> -inline uint8x8_t RightShift(const uint8x8_t vector) { +inline uint8x8_t RightShiftVector(const uint8x8_t vector) { return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift)); } template <int shift> -inline int8x8_t RightShift(const int8x8_t vector) { +inline int8x8_t RightShiftVector(const int8x8_t vector) { return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift)); } @@ -387,6 +396,15 @@ inline uint16_t SumVector(const uint8x8_t a) { #endif // defined(__aarch64__) } +inline uint32_t SumVector(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif // defined(__aarch64__) +} + inline uint32_t SumVector(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); @@ -447,6 +465,36 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { } // Input: +// 00 01 02 03 +// 10 11 12 13 +// 20 21 22 23 +// 30 31 32 33 +inline void Transpose4x4(uint16x4_t a[4]) { + // b: + // 00 10 02 12 + // 01 11 03 13 + const uint16x4x2_t b = vtrn_u16(a[0], a[1]); + // c: + // 20 30 22 32 + // 21 31 23 33 + const uint16x4x2_t c = vtrn_u16(a[2], a[3]); + // d: + // 00 10 20 30 + // 02 12 22 32 + const uint32x2x2_t d = + vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0])); + // e: + // 01 11 21 31 + // 03 13 23 33 + const uint32x2x2_t e = + vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1])); + a[0] = vreinterpret_u16_u32(d.val[0]); + a[1] = vreinterpret_u16_u32(e.val[0]); + a[2] = vreinterpret_u16_u32(d.val[1]); + a[3] = vreinterpret_u16_u32(e.val[1]); +} + +// Input: // a: 00 01 02 03 10 11 12 13 // b: 20 21 22 23 30 31 32 33 // Output: diff --git 
a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc index fd9b912..331bfe2 100644 --- a/src/dsp/arm/convolve_neon.cc +++ b/src/dsp/arm/convolve_neon.cc @@ -101,245 +101,278 @@ int16x8_t SumOnePassTaps(const uint8x8_t* const src, return vreinterpretq_s16_u16(sum); } -template <int filter_index, bool negative_outside_taps> -int16x8_t SumHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - uint8x8_t v_src[8]; - const uint8x16_t src_long = vld1q_u8(src); - int16x8_t sum; - - if (filter_index < 2) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1); - } else if (filter_index == 2) { - v_src[0] = vget_low_u8(src_long); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); - v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); - v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap); - } else if (filter_index == 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3); - } else if (filter_index > 3) { - v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2)); - v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3)); - v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4)); - v_src[3] = 
vget_low_u8(vextq_u8(src_long, src_long, 5)); - sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2); - } - return sum; -} - -template <int filter_index, bool negative_outside_taps> -uint8x8_t SimpleHorizontalTaps(const uint8_t* const src, - const uint8x8_t* const v_tap) { - int16x8_t sum = - SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template <int filter_index, bool negative_outside_taps> -uint16x8_t HorizontalTaps8To16(const uint8_t* const src, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template <int filter_index> -int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - uint16x8_t sum; - const uint8x8_t input0 = vld1_u8(src); - src += src_stride; - const uint8x8_t input1 = vld1_u8(src); - uint8x8x2_t input = vzip_u8(input0, input1); - - if (filter_index == 3) { - // tap signs : + + - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - } else if (filter_index == 4) { - // tap signs : - + + - - sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } else { - // tap signs 
: + + + + - sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]); - sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]); - sum = vmlal_u8(sum, input.val[1], v_tap[4]); - sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]); - } - - return vreinterpretq_s16_u16(sum); -} - -template <int filter_index> -uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. - constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); - return vqrshrun_n_s16(sum, kFilterBits - 1); -} - -template <int filter_index> -uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src, - const ptrdiff_t src_stride, - const uint8x8_t* const v_tap) { - const int16x8_t sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - return vreinterpretq_u16_s16( - vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); -} - -template <int num_taps, int step, int filter_index, - bool negative_outside_taps = true, bool is_2d = false, - bool is_compound = false> -void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, - void* const dest, const ptrdiff_t pred_stride, - const int width, const int height, - const uint8x8_t* const v_tap) { +template <int filter_index, bool negative_outside_taps, bool is_2d, + bool is_compound> +void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { auto* dest8 = static_cast<uint8_t*>(dest); auto* 
dest16 = static_cast<uint16_t*>(dest); - - // 4 tap filters are never used when width > 4. - if (num_taps != 4 && width > 4) { - int y = 0; + if (!is_2d) { + int y = height; do { int x = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x], - v_tap); + do { // Increasing loop counter x is better. + const uint8x16_t src_long = vld1q_u8(src + x); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + if (is_compound) { + const uint16x8_t v_sum = 
vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); vst1q_u16(&dest16[x], v_sum); } else { - const uint8x8_t result = - SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x], - v_tap); + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); vst1_u8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); + } else { + int x = 0; + do { + const uint8_t* s = src + x; + int y = height; + do { // Increasing loop counter x is better. 
+ const uint8x16_t src_long = vld1q_u8(s); + uint8x8_t v_src[8]; + int16x8_t sum; + if (filter_index < 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, + v_tap + 1); + } else if (filter_index == 2) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4)); + v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5)); + v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6)); + v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap); + } else if (filter_index == 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else if (filter_index > 3) { + v_src[0] = vget_low_u8(src_long); + v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1)); + v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2)); + v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3)); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + const uint16x8_t v_sum = vreinterpretq_u16_s16( + vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1)); + vst1q_u16(dest16, v_sum); + s += src_stride; + dest16 += 8; + } while (--y != 0); + x += 8; + } while (x < width); + } +} + +template <int filter_index, bool is_2d, bool is_compound> +void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int 
height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + int y = height; + do { + uint8x8_t v_src[4]; + int16x8_t sum; + v_src[0] = vld1_u8(src); + if (filter_index == 3) { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3); + } else { + v_src[1] = RightShiftVector<1 * 8>(v_src[0]); + v_src[2] = RightShiftVector<2 * 8>(v_src[0]); + v_src[3] = RightShiftVector<3 * 8>(v_src[0]); + sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2); + } + if (is_2d || is_compound) { + const uint16x4_t v_sum = vreinterpret_u16_s16( + vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1)); + vst1_u16(dest16, v_sum); + } else { + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1); + StoreLo4(&dest8[0], result); + } + src += src_stride; + dest8 += pred_stride; + dest16 += pred_stride; + } while (--y != 0); +} + +template <int filter_index, bool is_2d> +void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int height, const uint8x8_t* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + int y = height >> 1; + do { + const uint8x8_t input0 = vld1_u8(src); + const uint8x8_t input1 = vld1_u8(src + src_stride); + const uint8x8x2_t input = vzip_u8(input0, input1); + uint16x8_t sum; + if (filter_index == 3) { + // tap signs : + + + sum = vmull_u8(input.val[0], v_tap[3]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]); + } else if (filter_index == 4) { + // tap signs : - + + - + sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlsl_u8(sum, input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<4 * 
8>(input.val[0]), v_tap[4]); + sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } else { + // tap signs : + + + + + sum = vmull_u8(input.val[0], v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]); + sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]); + } + int16x8_t s = vreinterpretq_s16_u16(sum); + if (is_2d) { + const uint16x8_t v_sum = + vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1)); + dest16[0] = vgetq_lane_u16(v_sum, 0); + dest16[1] = vgetq_lane_u16(v_sum, 2); + dest16 += pred_stride; + dest16[0] = vgetq_lane_u16(v_sum, 1); + dest16[1] = vgetq_lane_u16(v_sum, 3); + dest16 += pred_stride; + } else { + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. + // Combining them requires adding the rounding offset from the skipped + // shift. + constexpr int first_shift_rounding_bit = + 1 << (kInterRoundBitsHorizontal - 2); + s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit)); + const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1); + dest8[0] = vget_lane_u8(result, 0); + dest8[1] = vget_lane_u8(result, 2); + dest8 += pred_stride; + dest8[0] = vget_lane_u8(result, 1); + dest8[1] = vget_lane_u8(result, 3); + dest8 += pred_stride; + } + src += src_stride << 1; + } while (--y != 0); + + // The 2d filters have an odd |height| because the horizontal pass + // generates context for the vertical pass. 
+ if (is_2d) { + assert(height % 2 == 1); + const uint8x8_t input = vld1_u8(src); + uint16x8_t sum; + if (filter_index == 3) { + sum = vmull_u8(input, v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]); + } else if (filter_index == 4) { + sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlsl_u8(sum, input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } else { + assert(filter_index == 5); + sum = vmull_u8(input, v_tap[2]); + sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]); + sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]); + sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]); + } + // |sum| contains an int16_t value. + sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum), + kInterRoundBitsHorizontal - 1)); + Store2<0>(dest16, sum); + } +} + +template <int filter_index, bool negative_outside_taps, bool is_2d, + bool is_compound> +void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const uint8x8_t* const v_tap) { + assert(width < 8 || filter_index <= 3); + // Don't simplify the redundant if conditions with the template parameters, + // which helps the compiler generate compact code. + if (width >= 8 && filter_index <= 3) { + FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d, + is_compound>(src, src_stride, dest, pred_stride, + width, height, v_tap); return; } - // Horizontal passes only needs to account for |num_taps| 2 and 4 when + // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. 
assert(width <= 4); - assert(num_taps <= 4); - if (num_taps <= 4) { + assert(filter_index >= 3 && filter_index <= 5); + if (filter_index >= 3 && filter_index <= 5) { if (width == 4) { - int y = 0; - do { - if (is_2d || is_compound) { - const uint16x8_t v_sum = - HorizontalTaps8To16<filter_index, negative_outside_taps>(src, - v_tap); - vst1_u16(dest16, vget_low_u16(v_sum)); - } else { - const uint8x8_t result = - SimpleHorizontalTaps<filter_index, negative_outside_taps>(src, - v_tap); - StoreLo4(&dest8[0], result); - } - src += src_stride; - dest8 += pred_stride; - dest16 += pred_stride; - } while (++y < height); + FilterHorizontalWidth4<filter_index, is_2d, is_compound>( + src, src_stride, dest, pred_stride, height, v_tap); return; } - + assert(width == 2); if (!is_compound) { - int y = 0; - do { - if (is_2d) { - const uint16x8_t sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); - dest16[0] = vgetq_lane_u16(sum, 0); - dest16[1] = vgetq_lane_u16(sum, 2); - dest16 += pred_stride; - dest16[0] = vgetq_lane_u16(sum, 1); - dest16[1] = vgetq_lane_u16(sum, 3); - dest16 += pred_stride; - } else { - const uint8x8_t sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - dest8[0] = vget_lane_u8(sum, 0); - dest8[1] = vget_lane_u8(sum, 2); - dest8 += pred_stride; - - dest8[0] = vget_lane_u8(sum, 1); - dest8[1] = vget_lane_u8(sum, 3); - dest8 += pred_stride; - } - - src += src_stride << 1; - y += 2; - } while (y < height - 1); - - // The 2d filters have an odd |height| because the horizontal pass - // generates context for the vertical pass. 
- if (is_2d) { - assert(height % 2 == 1); - uint16x8_t sum; - const uint8x8_t input = vld1_u8(src); - if (filter_index == 3) { // |num_taps| == 2 - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - } else if (filter_index == 4) { - sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]); - sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } else { - assert(filter_index == 5); - sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]); - sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]); - sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]); - sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]); - } - // |sum| contains an int16_t value. - sum = vreinterpretq_u16_s16(vrshrq_n_s16( - vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1)); - Store2<0>(dest16, sum); - } + FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -451,78 +484,85 @@ int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, } template <int num_taps, bool is_compound = false> -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const int16x8_t taps) { +void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const int16x8_t taps) { assert(width >= 8); constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. 
- const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); + auto* const dst8 = static_cast<uint8_t*>(dst); + auto* const dst16 = static_cast<uint16_t*>(dst); int x = 0; do { - int16x8_t srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + int16x8_t srcs[9]; + srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 4) { - srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps >= 6) { - srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; if (num_taps == 8) { - srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; + srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; } } } - int y = 0; + uint8_t* d8 = dst8 + x; + uint16_t* d16 = dst16 + x; + int y = height; do { - srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x)); - src_x += src_stride; - - const int16x8_t sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src)); + src += 8; + const int16x8_t sum0 = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps); + const int16x8_t sum1 = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps); if (is_compound) { - vst1q_u16(dst16 + x + y * 
dst_stride, vreinterpretq_u16_s16(sum)); + vst1q_u16(d16, vreinterpretq_u16_s16(sum0)); + d16 += dst_stride; + vst1q_u16(d16, vreinterpretq_u16_s16(sum1)); + d16 += dst_stride; } else { - vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum)); + vst1_u8(d8, vqmovun_s16(sum0)); + d8 += dst_stride; + vst1_u8(d8, vqmovun_s16(sum1)); + d8 += dst_stride; } - - srcs[0] = srcs[1]; + srcs[0] = srcs[2]; if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; } } } - } while (++y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } // Take advantage of |src_stride| == |width| to process two rows at a time. template <int num_taps, bool is_compound = false> -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth4(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -545,7 +585,7 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } - int y = 0; + int y = height; do { srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src)); src += 8; @@ -580,15 +620,15 @@ void Filter2DVertical4xH(const uint16_t* src, void* const dst, } } } - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } // Take advantage of |src_stride| == |width| to process four rows at a time. 
template <int num_taps> -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const int16x8_t taps) { +void Filter2DVerticalWidth2(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { constexpr int next_row = (num_taps < 6) ? 4 : 8; auto* dst8 = static_cast<uint8_t*>(dst); @@ -672,29 +712,47 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<8, 8, 2, true, is_2d, is_compound>( + FilterHorizontal<2, true, is_2d, is_compound>( src, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 1) { // 6 tap. // Check if outside taps are positive. if ((filter_id == 1) | (filter_id == 15)) { - FilterHorizontal<6, 8, 1, false, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, false, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else { - FilterHorizontal<6, 8, 1, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<1, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<6, 8, 0, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<0, true, is_2d, is_compound>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 4) { // 4 tap. - FilterHorizontal<4, 8, 4, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<5, true, is_2d, is_compound>( + src + 2, src_stride, dst, dst_stride, width, height, v_tap); } else { // 2 tap. - FilterHorizontal<2, 8, 3, true, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<3, true, is_2d, is_compound>( + src + 3, src_stride, dst, dst_stride, width, height, v_tap); + } +} + +template <int vertical_taps> +void Filter2DVertical(const uint16_t* const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction, const ptrdiff_t pred_stride) { + auto* const dest = static_cast<uint8_t*>(prediction); + if (width >= 8) { + Filter2DVerticalWidth8AndUp<vertical_taps>( + intermediate_result, dest, pred_stride, width, height, taps); + } else if (width == 4) { + Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest, + pred_stride, height, taps); + } else { + assert(width == 2); + Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest, + pred_stride, height, taps); } } @@ -704,7 +762,7 @@ void Convolve2D_NEON(const void* const reference, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); @@ -715,67 +773,31 @@ void Convolve2D_NEON(const void* const reference, intermediate_result[kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; const int intermediate_height = height + vertical_taps - 1; - const ptrdiff_t src_stride = reference_stride; - const auto* src = static_cast<const uint8_t*>(reference) - - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + const 
auto* const src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - + kHorizontalOffset; DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. - auto* dest = static_cast<uint8_t*>(prediction); - const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); - const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 2) { - Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<8>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 6) { - if (width == 2) { - Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<6>(intermediate_result, width, height, taps, prediction, + pred_stride); } else if (vertical_taps == 4) { - if (width == 2) { - Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<4>(intermediate_result, width, height, taps, prediction, + pred_stride); } else { // |vertical_taps| == 2 - if (width == 2) { - Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else if (width == 4) { - 
Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, - taps); - } else { - Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height, - taps); - } + Filter2DVertical<2>(intermediate_result, width, height, taps, prediction, + pred_stride); } } @@ -788,7 +810,7 @@ void Convolve2D_NEON(const void* const reference, // increments. The first load covers the initial elements of src_x, while the // final load covers the taps. template <int grade_x> -inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) { +inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) { uint8x8x3_t ret; const uint8x16_t src_val = vld1q_u8(src_x); ret.val[0] = vget_low_u8(src_val); @@ -811,7 +833,7 @@ inline uint8x16_t GetPositive2TapFilter(const int tap_index) { } template <int grade_x> -inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, +inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, @@ -843,7 +865,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -860,7 +882,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); return; } @@ -883,7 +905,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, // on x. const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices), VQTbl1U8(filter_taps1, filter_indices)}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. 
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -900,7 +922,7 @@ inline void ConvolveKernelHorizontal2Tap(const uint8_t* src, kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -921,7 +943,7 @@ inline uint8x16_t GetPositive4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. void ConvolveKernelHorizontalPositive4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -950,7 +972,7 @@ void ConvolveKernelHorizontalPositive4Tap( const uint8x8_t src_indices = vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped index vectors. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -970,7 +992,7 @@ void ConvolveKernelHorizontalPositive4Tap( src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. @@ -988,7 +1010,7 @@ inline uint8x16_t GetSigned4TapFilter(const int tap_index) { // This filter is only possible when width <= 4. 
inline void ConvolveKernelHorizontalSigned4Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x, + const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x, const int step_x, const int intermediate_height, int16_t* intermediate) { const int kernel_offset = 2; const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1025,7 +1047,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( vadd_u8(src_indices_base, vdup_n_u8(2)), vadd_u8(src_indices_base, vdup_n_u8(3))}; - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x16_t src_vals = vld1q_u8(src_x); @@ -1042,7 +1064,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); } // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0]. @@ -1063,9 +1085,9 @@ inline uint8x16_t GetSigned6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalSigned6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1107,7 +1129,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( for (int i = 0; i < 6; ++i) { taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. 
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1122,7 +1144,7 @@ inline void ConvolveKernelHorizontalSigned6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1156,9 +1178,9 @@ inline int8x16_t GetMixed6TapFilter(const int tap_index) { // This filter is only possible when width >= 8. template <int grade_x> inline void ConvolveKernelHorizontalMixed6Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const int kernel_offset = 1; const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); @@ -1205,7 +1227,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices)); mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices)); - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1224,7 +1246,7 @@ inline void ConvolveKernelHorizontalMixed6Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1250,9 +1272,9 @@ inline uint8x16_t GetSigned8TapFilter(const int tap_index) { // This filter is only possible when width >= 8. 
template <int grade_x> inline void ConvolveKernelHorizontalSigned8Tap( - const uint8_t* src, const ptrdiff_t src_stride, const int width, + const uint8_t* const src, const ptrdiff_t src_stride, const int width, const int subpixel_x, const int step_x, const int intermediate_height, - int16_t* intermediate) { + int16_t* const intermediate) { const uint8x8_t one = vdup_n_u8(1); const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); const int ref_x = subpixel_x >> kScaleSubPixelBits; @@ -1290,7 +1312,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( taps[i] = VQTbl1U8(filter_taps[i], filter_indices); } - int y = 0; + int y = intermediate_height; do { // Load a pool of samples to select from using stepped indices. const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x); @@ -1306,7 +1328,7 @@ inline void ConvolveKernelHorizontalSigned8Tap( kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; - } while (++y < intermediate_height); + } while (--y != 0); x += 8; p += step_x8; } while (x < width); @@ -1314,9 +1336,9 @@ inline void ConvolveKernelHorizontalSigned8Tap( // This function handles blocks of width 2 or 4. 
template <int num_taps, int grade_y, int width, bool is_compound> -void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, +void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y, const int filter_index, const int step_y, - const int height, void* dest, + const int height, void* const dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; const int16_t* src_y = src; @@ -1327,8 +1349,8 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1_s16(src_y + i * src_stride); } @@ -1381,16 +1403,16 @@ void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } template <int num_taps, int grade_y, bool is_compound> -inline void ConvolveVerticalScale(const int16_t* src, const int width, +inline void ConvolveVerticalScale(const int16_t* const src, const int width, const int subpixel_y, const int filter_index, const int step_y, const int height, - void* dest, const ptrdiff_t dest_stride) { + void* const dest, + const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; // A possible improvement is to use arithmetic to decide how many times to // apply filters to same source before checking whether to load new srcs. 
@@ -1401,15 +1423,15 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, uint8_t* dest_y; int x = 0; - do { // x < width - const int16_t* src_x = src + x; + do { + const int16_t* const src_x = src + x; const int16_t* src_y = src_x; dest16_y = static_cast<uint16_t*>(dest) + x; dest_y = static_cast<uint8_t*>(dest) + x; int p = subpixel_y & 1023; int prev_p = p; - int y = 0; - do { // y < height + int y = height; + do { for (int i = 0; i < num_taps; ++i) { s[i] = vld1q_s16(src_y + i * src_stride); } @@ -1448,9 +1470,8 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width, prev_p = p; dest16_y += dest_stride; dest_y += dest_stride; - - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); x += 8; } while (x < width); } @@ -1462,7 +1483,7 @@ void ConvolveScale2D_NEON(const void* const reference, const int vertical_filter_index, const int subpixel_x, const int subpixel_y, const int step_x, const int step_y, const int width, const int height, - void* prediction, const ptrdiff_t pred_stride) { + void* const prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); assert(step_x <= 2048); @@ -1699,12 +1720,13 @@ void ConvolveHorizontal_NEON(const void* const reference, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); // Set |src| to the outermost tap. 
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; - auto* dest = static_cast<uint8_t*>(prediction); + const auto* const src = + static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* const dest = static_cast<uint8_t*>(prediction); DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height, horizontal_filter_id, filter_index); @@ -1719,14 +1741,14 @@ uint16x8_t Compound1DShift(const int16x8_t sum) { template <int filter_index, bool is_compound = false, bool negative_outside_taps = false> -void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, +void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int width, const int height, const uint8x8_t* const taps) { const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); + auto* const dst8 = static_cast<uint8_t*>(dst); + auto* const dst16 = static_cast<uint16_t*>(dst); assert(width >= 8); int x = 0; @@ -1754,6 +1776,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } + // Decreasing the y loop counter produces worse code with clang. + // Don't unroll this loop since it generates too much code and the decoder + // is even slower. 
int y = 0; do { srcs[next_row] = vld1_u8(src_x); @@ -1804,7 +1829,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = Load4(src); src += src_stride; - int y = 0; + int y = height; do { srcs[0] = Load4<1>(src, srcs[0]); src += src_stride; @@ -1829,8 +1854,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } srcs[0] = srcs[2]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 4) { srcs[4] = vdup_n_u8(0); @@ -1842,7 +1867,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[1] = vext_u8(srcs[0], srcs[2], 4); - int y = 0; + int y = height; do { srcs[2] = Load4<1>(src, srcs[2]); src += src_stride; @@ -1869,8 +1894,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[0] = srcs[2]; srcs[1] = srcs[3]; srcs[2] = srcs[4]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 6) { srcs[6] = vdup_n_u8(0); @@ -1887,7 +1912,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[3] = vext_u8(srcs[2], srcs[4], 4); - int y = 0; + int y = height; do { srcs[4] = Load4<1>(src, srcs[4]); src += src_stride; @@ -1916,8 +1941,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[2] = srcs[4]; srcs[3] = srcs[5]; srcs[4] = srcs[6]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else if (num_taps == 8) { srcs[8] = vdup_n_u8(0); @@ -1939,7 +1964,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; srcs[5] = vext_u8(srcs[4], srcs[6], 4); - int y = 0; + int y = height; do { srcs[6] = Load4<1>(src, srcs[6]); src += src_stride; @@ -1970,8 +1995,8 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[4] = srcs[6]; srcs[5] = srcs[7]; srcs[6] = srcs[8]; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2186,14 +2211,14 @@ 
void ConvolveVertical_NEON(const void* const reference, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast<uint8_t*>(prediction); + auto* const dest = static_cast<uint8_t*>(prediction); const ptrdiff_t dest_stride = pred_stride; assert(vertical_filter_id != 0); @@ -2303,7 +2328,7 @@ void ConvolveCompoundCopy_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const auto* src = static_cast<const uint8_t*>(reference); const ptrdiff_t src_stride = reference_stride; @@ -2312,7 +2337,7 @@ void ConvolveCompoundCopy_NEON( kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; if (width >= 16) { - int y = 0; + int y = height; do { int x = 0; do { @@ -2328,20 +2353,20 @@ void ConvolveCompoundCopy_NEON( } while (x < width); src += src_stride; dest += width; - } while (++y < height); + } while (--y != 0); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t v_src = vld1_u8(&src[0]); const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); src += src_stride; dest += width; - } while (++y < height); - } else { /* width == 4 */ + } while (--y != 0); + } else { // width == 4 uint8x8_t v_src = vdup_n_u8(0); - int y = 0; + int y = height; do { 
v_src = Load4<0>(&src[0], v_src); src += src_stride; @@ -2350,8 +2375,8 @@ void ConvolveCompoundCopy_NEON( const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift); vst1q_u16(&dest[0], v_dest); dest += 4 << 1; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2359,14 +2384,14 @@ void ConvolveCompoundVertical_NEON( const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int vertical_filter_index, const int /*horizontal_filter_id*/, const int vertical_filter_id, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); const int vertical_taps = GetNumTapsInFilter(filter_index); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; - auto* dest = static_cast<uint16_t*>(prediction); + auto* const dest = static_cast<uint16_t*>(prediction); assert(vertical_filter_id != 0); uint8x8_t taps[8]; @@ -2454,24 +2479,39 @@ void ConvolveCompoundHorizontal_NEON( const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int /*vertical_filter_index*/, const int horizontal_filter_id, const int /*vertical_filter_id*/, - const int width, const int height, void* prediction, + const int width, const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(horizontal_filter_index, width); - const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; - auto* dest = static_cast<uint16_t*>(prediction); + const auto* const src = + static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* const dest = static_cast<uint16_t*>(prediction); DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>( src, reference_stride, dest, width, width, height, 
horizontal_filter_id, filter_index); } +template <int vertical_taps> +void Compound2DVertical(const uint16_t* const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* const prediction) { + auto* const dest = static_cast<uint16_t*>(prediction); + if (width == 4) { + Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>( + intermediate_result, dest, width, height, taps); + } else { + Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>( + intermediate_result, dest, width, width, height, taps); + } +} + void ConvolveCompound2D_NEON(const void* const reference, const ptrdiff_t reference_stride, const int horizontal_filter_index, const int vertical_filter_index, const int horizontal_filter_id, const int vertical_filter_id, const int width, - const int height, void* prediction, + const int height, void* const prediction, const ptrdiff_t /*pred_stride*/) { // The output of the horizontal filter, i.e. the intermediate_result, is // guaranteed to fit in int16_t. @@ -2492,55 +2532,26 @@ void ConvolveCompound2D_NEON(const void* const reference, const auto* const src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; - DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( src, src_stride, intermediate_result, width, width, intermediate_height, horizontal_filter_id, horiz_filter_index); // Vertical filter. 
- auto* dest = static_cast<uint16_t*>(prediction); assert(vertical_filter_id != 0); - - const ptrdiff_t dest_stride = width; const int16x8_t taps = vmovl_s8( vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); - if (vertical_taps == 8) { - if (width == 4) { - Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<8, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 6) { - if (width == 4) { - Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<6, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); } else if (vertical_taps == 4) { - if (width == 4) { - Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<4, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); } else { // |vertical_taps| == 2 - if (width == 4) { - Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, - dest_stride, height, taps); - } else { - Filter2DVertical<2, /*is_compound=*/true>( - intermediate_result, dest, dest_stride, width, height, taps); - } + Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); } } -inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) { +inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) { const uint8x16_t left = vld1q_u8(src); const uint8x16_t right = vld1q_u8(src + 1); vst1q_u8(dst, vrhaddq_u8(left, right)); @@ -2554,7 +2565,7 @@ inline void 
IntraBlockCopyHorizontal(const uint8_t* src, const ptrdiff_t src_remainder_stride = src_stride - (width - 16); const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); - int y = 0; + int y = height; do { HalfAddHorizontal(src, dst); if (width >= 32) { @@ -2586,7 +2597,7 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopyHorizontal_NEON( @@ -2610,7 +2621,7 @@ void ConvolveIntraBlockCopyHorizontal_NEON( IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, pred_stride); } else if (width == 8) { - int y = 0; + int y = height; do { const uint8x8_t left = vld1_u8(src); const uint8x8_t right = vld1_u8(src + 1); @@ -2618,11 +2629,11 @@ void ConvolveIntraBlockCopyHorizontal_NEON( src += reference_stride; dest += pred_stride; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -2637,13 +2648,13 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; StoreHi4(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { assert(width == 2); uint8x8_t left = vdup_n_u8(0); uint8x8_t right = vdup_n_u8(0); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -2658,8 +2669,8 @@ void ConvolveIntraBlockCopyHorizontal_NEON( dest += pred_stride; Store2<1>(dest, result); dest += pred_stride; - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -2694,7 +2705,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } src += src_remainder_stride; - int y = 0; + int y = height; do { below[0] = vld1q_u8(src); if (width >= 32) { @@ -2749,7 +2760,7 @@ inline void IntraBlockCopyVertical(const uint8_t* src, } } dst += 
dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopyVertical_NEON( @@ -2778,7 +2789,7 @@ void ConvolveIntraBlockCopyVertical_NEON( row = vld1_u8(src); src += reference_stride; - int y = 0; + int y = height; do { below = vld1_u8(src); src += reference_stride; @@ -2787,13 +2798,13 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else if (width == 4) { uint8x8_t row = Load4(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load4<0>(src, below); src += reference_stride; @@ -2802,14 +2813,14 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } else { assert(width == 2); uint8x8_t row = Load2(src); uint8x8_t below = vdup_n_u8(0); src += reference_stride; - int y = 0; + int y = height; do { below = Load2<0>(src, below); src += reference_stride; @@ -2818,7 +2829,7 @@ void ConvolveIntraBlockCopyVertical_NEON( dest += pred_stride; row = below; - } while (++y < height); + } while (--y != 0); } } @@ -2870,7 +2881,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; - int y = 0; + int y = height; do { const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2)); @@ -2981,7 +2992,7 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, } src += src_remainder_stride; dst += dst_remainder_stride; - } while (++y < height); + } while (--y != 0); } void ConvolveIntraBlockCopy2D_NEON( @@ -3013,7 +3024,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load4<0>(src, left); right = Load4<0>(src + 1, right); @@ -3032,8 +3043,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; row = 
vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } else { uint8x8_t left = Load2(src); uint8x8_t right = Load2(src + 1); @@ -3041,7 +3052,7 @@ void ConvolveIntraBlockCopy2D_NEON( uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); - int y = 0; + int y = height; do { left = Load2<0>(src, left); right = Load2<0>(src + 1, right); @@ -3060,8 +3071,8 @@ void ConvolveIntraBlockCopy2D_NEON( dest += pred_stride; row = vget_high_u16(below); - y += 2; - } while (y < height); + y -= 2; + } while (y != 0); } } @@ -3093,7 +3104,7 @@ void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc index 04952ab..a0cd0ac 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -30,10 +30,12 @@ namespace libgav1 { namespace dsp { -namespace { constexpr int kInterPostRoundBit = 4; +namespace low_bitdepth { +namespace { + inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, const int16x4_t weights[2]) { @@ -185,13 +187,167 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0, + const uint16x4x2_t pred1, + const uint16x4_t weights[2]) { + const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]); + const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]); + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + 
const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset); + const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. + uint16x4x2_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max); + return result; +} + +inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0, + const uint16x4x4_t pred1, + const uint16x4_t weights[2]) { + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]); + const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]); + const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset); + const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset); + const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]); + const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]); + const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]); + const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]); + const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset); + const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. 
+ uint16x4x4_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max); + result.val[2] = + vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max); + result.val[3] = + vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max); + + return result; +} + +// We could use vld1_u16_x2, but for compatibility reasons, use this function +// instead. The compiler optimizes to the correct instruction. +inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) { + uint16x4x2_t x; + // gcc/clang (64 bit) optimizes the following to ldp. + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + return x; +} + +// We could use vld1_u16_x4, but for compatibility reasons, use this function +// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better +// performance in the speed tests. +inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) { + uint16x4x4_t x; + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + x.val[2] = vld1_u16(ptr + 8); + x.val[3] = vld1_u16(ptr + 12); + return x; +} + +void DistanceWeightedBlend_NEON(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, const uint8_t weight_1, + const int width, const int height, + void* const dest, const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)}; -void DistanceWeightedBlendInit_NEON() { Init8bpp(); } + if (width == 4) { + int y = height; + do { + const uint16x4x2_t src0 = LoadU16x4_x2(pred_0); + const uint16x4x2_t src1 = LoadU16x4_x2(pred_1); + const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + 
vst1_u16(dst + dst_stride, res.val[1]); + dst += dst_stride << 1; + pred_0 += 8; + pred_1 += 8; + y -= 2; + } while (y != 0); + } else if (width == 8) { + int y = height; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + 4, res.val[1]); + vst1_u16(dst + dst_stride, res.val[2]); + vst1_u16(dst + dst_stride + 4, res.val[3]); + dst += dst_stride << 1; + pred_0 += 16; + pred_1 += 16; + y -= 2; + } while (y != 0); + } else { + int y = height; + do { + int x = 0; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst + x, res.val[0]); + vst1_u16(dst + x + 4, res.val[1]); + vst1_u16(dst + x + 8, res.val[2]); + vst1_u16(dst + x + 12, res.val[3]); + x += 16; + } while (x < width); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h index 4d8824c..94a799c 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.h +++ b/src/dsp/arm/distance_weighted_blend_neon.h @@ -34,6 +34,8 @@ void DistanceWeightedBlendInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 2612466..8ee3745 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -1176,7 +1176,7 @@ void FilmGrainInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc index 00b186a..074283f 100644 --- a/src/dsp/arm/intra_edge_neon.cc +++ b/src/dsp/arm/intra_edge_neon.cc @@ -25,7 +25,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" -#include "src/utils/common.h" // RightShiftWithRounding() +#include "src/utils/common.h" namespace libgav1 { namespace dsp { @@ -35,6 +35,11 @@ namespace { // required. constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}}; +} // namespace + +namespace low_bitdepth { +namespace { + void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { assert(strength == 1 || strength == 2 || strength == 3); const int kernel_index = strength - 1; @@ -44,6 +49,8 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // elements written is |size| - 1. if (size == 1) return; + const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908)); // |strength| 1 and 2 use a 3 tap filter. if (strength < 3) { // The last value requires extending the buffer (duplicating @@ -89,7 +96,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // |remainder| == 1 then we don't have to do anything. 
const int remainder = (size - 1) & 0xf; if (remainder > 1) { - uint8_t temp[16]; const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); @@ -102,9 +108,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[size - 1] = last_val; @@ -173,7 +181,6 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { // Like the 3 tap but if there are two remaining values we have already // calculated them. if (remainder > 2) { - uint8_t temp[16]; const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); @@ -193,9 +200,11 @@ void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { const uint8x16_t result = vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); - - vst1q_u8(temp, result); - memcpy(dst_buffer + i, temp, remainder); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. 
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result); + vst1q_u8(dst_buffer + i, dst_remainder); } dst_buffer[1] = special_vals[0]; @@ -284,13 +293,225 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +const uint16_t kRemainderMask[8][8] = { + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000}, +}; + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast<uint16_t*>(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. 
+ const uint16_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint16_t krn0 = kKernelsNEON[kernel_index][0]; + const uint16_t krn1 = kKernelsNEON[kernel_index][1]; + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + // Load the next row before overwriting. This loads an extra 7 values + // past |size| on the trailing iteration. + src_0 = vld1q_u16(dst_buffer + i + 7); + vst1q_u16(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. + const int remainder = (size - 1) & 0x7; + if (remainder > 1) { + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. 
The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint16_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. + const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer - 1); + uint16x8_t src_1 = vld1q_u16(dst_buffer); + int i = 1; + + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + + // Load the next before overwriting. 
+ src_0 = vld1q_u16(dst_buffer + i + 6); + src_1 = vld1q_u16(dst_buffer + i + 7); + + vst1q_u16(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0x7; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. + if (remainder > 2) { + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. + if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast<uint16_t*>(buffer); -void IntraEdgeInit_NEON() { Init8bpp(); } + // Extend first/last samples + pixel_buffer[-2] = pixel_buffer[-1]; + pixel_buffer[size] = pixel_buffer[size - 1]; + + const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2)); + const int16x8_t src_hi = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8)); + const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3)); + const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3)); + + int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo); + sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2)); + sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3)); + sum_lo = vrshrq_n_s16(sum_lo, 4); + + uint16x8x2_t result_lo; + result_lo.val[0] = + 
vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2)); + + if (size > 8) { + const int16x8_t src_hi_extra = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2)); + const int16x8_t src9_hi_extra = + vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3)); + + int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi); + sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2)); + sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3)); + sum_hi = vrshrq_n_s16(sum_hi, 4); + + uint16x8x2_t result_hi; + result_hi.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_hi.val[1] = + vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2)); + vst2q_u16(pixel_buffer - 1, result_lo); + vst2q_u16(pixel_buffer + 15, result_hi); + } else { + vst2q_u16(pixel_buffer - 1, result_lo); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraEdgeInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h index d3bb243..28e3494 100644 --- a/src/dsp/arm/intra_edge_neon.h +++ b/src/dsp/arm/intra_edge_neon.h @@ -34,6 +34,9 @@ void IntraEdgeInit_NEON(); #define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON 
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc index 45fe33b..8d8748f 100644 --- a/src/dsp/arm/intrapred_cfl_neon.cc +++ b/src/dsp/arm/intrapred_cfl_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -27,45 +27,20 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { -namespace low_bitdepth { -namespace { - -uint8x16_t Set2ValuesQ(const uint8_t* a) { - uint16_t combined_values = a[0] | a[1] << 8; - return vreinterpretq_u8_u16(vdupq_n_u16(combined_values)); -} - -uint32_t SumVector(uint32x2_t a) { -#if defined(__aarch64__) - return vaddv_u32(a); -#else - const uint64x1_t b = vpaddl_u32(a); - return vget_lane_u32(vreinterpret_u32_u64(b), 0); -#endif // defined(__aarch64__) -} - -uint32_t SumVector(uint32x4_t a) { -#if defined(__aarch64__) - return vaddvq_u32(a); -#else - const uint64x2_t b = vpaddlq_u32(a); - const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b)); - return vget_lane_u32(vreinterpret_u32_u64(c), 0); -#endif // defined(__aarch64__) -} // Divide by the number of elements. -uint32_t Average(const uint32_t sum, const int width, const int height) { +inline uint32_t Average(const uint32_t sum, const int width, const int height) { return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height)); } // Subtract |val| from every element in |a|. 
-void BlockSubtract(const uint32_t val, - int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], - const int width, const int height) { +inline void BlockSubtract(const uint32_t val, + int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], + const int width, const int height) { assert(val <= INT16_MAX); const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val)); @@ -94,6 +69,9 @@ void BlockSubtract(const uint32_t val, } } +namespace low_bitdepth { +namespace { + template <int block_width, int block_height> void CflSubsampler420_NEON( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -122,26 +100,27 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else if (block_width == 8) { - const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14}; - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); + const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + const uint16x8_t x_max_index = + vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16); + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2); - const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride); + const uint8x16_t row0 = vld1q_u8(src); + const uint8x16_t row1 = vld1q_u8(src + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); - uint8x16_t row0 = vld1q_u8(src); - row0 = vbslq_u8(x_mask, row0, x_max0); - uint8x16_t row1 = vld1q_u8(src + stride); - row1 = vbslq_u8(x_mask, row1, x_max1); + // Dup the 2x2 sum at the max luma offset. 
+ const uint16x8_t max_luma_sum = + vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row)); - uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); - sum_row = vshlq_n_u16(sum_row, 1); - running_sum = vpadalq_u16(running_sum, sum_row); - vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row)); + running_sum = vpadalq_u16(running_sum, final_sum_row); if (y << 1 < max_luma_height - 2) { src += stride << 1; @@ -150,45 +129,35 @@ void CflSubsampler420_NEON( sum = SumVector(running_sum); } else /* block_width >= 16 */ { - const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2); + const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2); uint32x4_t running_sum = vdupq_n_u32(0); for (int y = 0; y < block_height; ++y) { - uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30}; - const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]); - const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]); - const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]); - const uint8x16_t x_max11 = - vdupq_n_u8(src[stride + max_luma_width - 2 + 1]); - for (int x = 0; x < block_width; x += 16) { - const ptrdiff_t src_x_offset = x << 1; - const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); - const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset); - const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride); - const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00); - const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01); - const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10); - const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11); - - uint16x8_t sum_row_lo = - vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01)); - sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10)); - sum_row_lo = 
vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11)); - sum_row_lo = vshlq_n_u16(sum_row_lo, 1); - running_sum = vpadalq_u16(running_sum, sum_row_lo); - vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo)); - - uint16x8_t sum_row_hi = - vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10)); - sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11)); - sum_row_hi = vshlq_n_u16(sum_row_hi, 1); - running_sum = vpadalq_u16(running_sum, sum_row_hi); - vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi)); - - x_index = vaddq_u8(x_index, vdupq_n_u8(32)); + // Calculate the 2x2 sum at the max_luma offset + const uint8_t a00 = src[max_luma_width - 2]; + const uint8_t a01 = src[max_luma_width - 1]; + const uint8_t a10 = src[max_luma_width - 2 + stride]; + const uint8_t a11 = src[max_luma_width - 1 + stride]; + // Dup the 2x2 sum at the max luma offset. + const uint16x8_t max_luma_sum = + vdupq_n_u16((uint16_t)((a00 + a01 + a10 + a11) << 1)); + uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + + ptrdiff_t src_x_offset = 0; + for (int x = 0; x < block_width; x += 8, src_x_offset += 16) { + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); + const uint8x16_t row0 = vld1q_u8(src + src_x_offset); + const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row)); + + running_sum = vpadalq_u16(running_sum, final_sum_row); + x_index = vaddq_u16(x_index, vdupq_n_u16(16)); } + if (y << 1 < max_luma_height - 2) { src += stride << 1; } @@ -209,17 +178,30 @@ void CflSubsampler444_NEON( uint32_t sum; if (block_width == 4) { assert(max_luma_width >= 4); + assert(max_luma_height <= block_height); + 
assert((max_luma_height % 2) == 0); uint32x4_t running_sum = vdupq_n_u32(0); uint8x8_t row = vdup_n_u8(0); - for (int y = 0; y < block_height; y += 2) { + uint16x8_t row_shifted; + int y = 0; + do { row = Load4<0>(src, row); row = Load4<1>(src + stride, row); if (y < (max_luma_height - 1)) { src += stride << 1; } - const uint16x8_t row_shifted = vshll_n_u8(row, 3); + row_shifted = vshll_n_u8(row, 3); + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); + vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); + y += 2; + } while (y < max_luma_height); + + row_shifted = + vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted)); + for (; y < block_height; y += 2) { running_sum = vpadalq_u16(running_sum, row_shifted); vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); @@ -463,12 +445,874 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflSubsampler +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +// This duplicates the last two 16-bit values in |row|. +inline uint16x8_t LastRowSamples(const uint16x8_t row) { + const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row)); + const uint32x4_t b = vdupq_lane_u32(a, 1); + return vreinterpretq_u16_u32(b); +} + +// This duplicates the last unsigned 16-bit value in |row|. 
+inline uint16x8_t LastRowResult(const uint16x8_t row) { + const uint16x4_t a = vget_high_u16(row); + const uint16x8_t b = vdupq_lane_u16(a, 0x3); + return b; +} + +// This duplicates the last signed 16-bit value in |row|. +inline int16x8_t LastRowResult(const int16x8_t row) { + const int16x4_t a = vget_high_s16(row); + const int16x8_t b = vdupq_lane_s16(a, 0x3); + return b; +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted))); + vst1_s16(luma_ptr + kCflLumaBufferStride, + vreinterpret_s16_u16(vget_high_u16(result_shifted))); + return result_shifted; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted)); + return result_shifted; +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint16x4_t sum = vdup_n_u16(0); + uint16x4_t samples[2]; + int y = visible_height; + + do { + samples[0] = vld1_u16(src); + samples[1] = vld1_u16(src + src_stride); + src += src_stride << 1; + sum = vadd_u16(sum, samples[0]); + sum = vadd_u16(sum, samples[1]); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples[1] = vshl_n_u16(samples[1], 1); + do { + sum = vadd_u16(sum, samples[1]); + y += 2; + } while (y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
+ const uint32_t average_sum = + RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x4_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1_s16(ssrc); + ssample = vshl_n_s16(ssample, 3); + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples; + int y = visible_height; + + do { + samples = 
vld1q_u16(src); + src += src_stride; + sum = vpadalq_u16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = vpadalq_u16(sum, samples); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). + const uint32_t average_sum = + RightShiftWithRounding(SumVector(sum), block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1q_s16(ssrc); + ssample = vshlq_n_s16(ssample, 3); + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_NEON( + 
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples[4]; + int y = visible_height; + + do { + samples[0] = vld1q_u16(src); + samples[1] = + (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]); + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24) + : LastRowResult(samples[2]); + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + sum = vpadalq_u16(sum, inner_sum); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + do { + sum = vpadalq_u16(sum, inner_sum); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ const uint32_t average_sum = RightShiftWithRounding( + SumVector(sum), block_width_log2 + block_height_log2 - 3); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssamples_ext = vdupq_n_s16(0); + int16x8_t ssamples[4]; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + ssamples[idx] = vld1q_s16(&ssrc[x]); + ssamples[idx] = vshlq_n_s16(ssamples[idx], 3); + ssamples_ext = ssamples[idx]; + } else { + ssamples[idx] = LastRowResult(ssamples_ext); + } + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int 
block_height_log2> +void CflSubsampler420_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row0 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row1 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1); + + const uint16x8_t samples_row2 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row3 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3); + uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const uint16x8_t samples_row4 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row5 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5); + + const uint16x8_t samples_row6 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row7 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7); + sum = + vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + const uint16x4_t final_fill = + vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill); + for (y = luma_height; y < block_height; ++y) { + vst1_s16(luma_ptr, 
vreinterpret_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x4_t samples = vld1_s16(luma_ptr); + vst1_s16(luma_ptr, vsub_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const uint16x8_t samples_row10 = vld1q_u16(src); + const uint16x8_t samples_row11 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11); + uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row20 = vld1q_u16(src); + const uint16x8_t samples_row21 = (max_luma_width == 16) + ? 
vld1q_u16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const uint16x8_t samples_row30 = vld1q_u16(src); + const uint16x8_t samples_row31 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30); + const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row40 = vld1q_u16(src); + const uint16x8_t samples_row41 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const uint16x8_t samples_row50 = vld1q_u16(src); + const uint16x8_t samples_row51 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50); + const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row60 = vld1q_u16(src); + const uint16x8_t samples_row61 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const uint16x8_t samples_row70 = vld1q_u16(src); + const uint16x8_t samples_row71 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70); + const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. 
+ const uint16x8_t final_fill = + vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = + vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill)); + + for (y = luma_height; y < block_height; ++y) { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + // Begin first y section, covering width up to 32. 
+ int y = luma_height; + + uint16x8_t final_fill0, final_fill1; + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16_t* src_next = src + src_stride; + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width >= 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + const uint16x8_t samples_row02 = (max_luma_width >= 24) + ? vld1q_u16(src + 16) + : LastRowSamples(samples_row01); + const uint16x8_t samples_row03 = (max_luma_width == 32) + ? vld1q_u16(src + 24) + : LastRowSamples(samples_row02); + const uint16x8_t samples_row10 = vld1q_u16(src_next); + const uint16x8_t samples_row11 = (max_luma_width >= 16) + ? vld1q_u16(src_next + 8) + : LastRowSamples(samples_row10); + const uint16x8_t samples_row12 = (max_luma_width >= 24) + ? vld1q_u16(src_next + 16) + : LastRowSamples(samples_row11); + const uint16x8_t samples_row13 = (max_luma_width == 32) + ? vld1q_u16(src_next + 24) + : LastRowSamples(samples_row12); + const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11); + const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12); + const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13); + final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1); + + final_sum = vpadalq_u16(final_sum, sum); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const uint16x8_t wide_fill = LastRowResult(final_fill1); + final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. 
+ y = luma_height; + if (y < block_height) { + uint32x4_t wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. (a << 2) = (a + a) << 1. + wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2); + } + const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1); + const uint32x4_t final_fill_to_sum = vaddl_u16( + vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum)); + + do { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0)); + vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1)); + if (block_width_log2 == 5) { + final_sum = vaddq_u32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_width_log2 + block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples0 = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples0, averages)); + const int16x8_t samples1 = vld1q_s16(luma_ptr + 8); + const int16x8_t final_row_result = vsubq_s16(samples1, averages); + vst1q_s16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const int16x8_t wide_fill = LastRowResult(final_row_result); + vst1q_s16(luma_ptr + 16, wide_fill); + vst1q_s16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +//------------------------------------------------------------------------------ +// Choose subsampler based on max_luma_width +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + 
switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>( + luma, max_luma_height, source, stride); + return; + } +} + +//------------------------------------------------------------------------------ +// CflIntraPredictor + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. +// |alpha| can be -16 to 16 (inclusive). +// Clip |dc + ((alpha * luma) >> 6))| to 0, (1 << bitdepth) - 1. +inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs, + const int16x8_t alpha_signed, const int16x8_t dc, + const uint16x8_t max_value) { + const int16x8_t luma_abs = vabsq_s16(luma); + const int16x8_t luma_alpha_sign = + vshrq_n_s16(veorq_s16(luma, alpha_signed), 15); + // (alpha * luma) >> 6 + const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs); + // Convert back to signed values. + const int16x8_t la = + vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign); + const int16x8_t result = vaddq_s16(la, dc); + const int16x8_t zero = vdupq_n_s16(0); + // Clip. 
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value); +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor4xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; y += 2) { + const int16x4_t luma_row0 = vld1_s16(luma[y]); + const int16x4_t luma_row1 = vld1_s16(luma[y + 1]); + const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1); + const uint16x8_t sum = + Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value); + vst1_u16(dst, vget_low_u16(sum)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(sum)); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor8xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row = vld1q_s16(luma[y]); + const uint16x8_t sum = + Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor16xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t 
luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor32xN_NEON( + void* const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16); + const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_2 = + Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_3 = + Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, 
sum_1); + vst1q_u16(dst + 16, sum_2); + vst1q_u16(dst + 24, sum_3); + dst += dst_stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 5>; + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + 
CflSubsampler444_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 5>; + + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>; + + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor16xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor16xN_NEON<32>; + dsp->cfl_intra_predictors[kTransformSize32x8] = 
CflIntraPredictor32xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor32xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor32xN_NEON<32>; + // Max Cfl predictor size is 32x32. +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h new file mode 100644 index 0000000..b4f983a --- /dev/null +++ b/src/dsp/arm/intrapred_cfl_neon.h @@ -0,0 +1,179 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. 
+void IntraPredCflInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// ----------------------------------------------------------------------------- +// 10bpp + +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 805ba81..3f5edbd 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_directional.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON #include <arm_neon.h> -#include <algorithm> // std::min +#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memset +#include <cstring> #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" @@ -35,14 +35,14 @@ namespace dsp { namespace low_bitdepth { namespace { -// Blend two values based on a 32 bit weight. +// Blend two values based on weights that sum to 32. inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b, const uint8x8_t a_weight, const uint8x8_t b_weight) { const uint16x8_t a_product = vmull_u8(a, a_weight); const uint16x8_t b_product = vmull_u8(b, b_weight); - return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5); + return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/); } // For vertical operations the weights are one constant value. @@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, // 4 wide subsamples the output. 8 wide subsamples the input. if (width == 4) { const uint8x8_t left_values = vld1_u8(top + top_base_x); - const uint8x8_t right_values = RightShift<8>(left_values); + const uint8x8_t right_values = RightShiftVector<8>(left_values); const uint8x8_t value = WeightedBlend(left_values, right_values, shift); // If |upsampled| is true then extract every other value for output. @@ -910,12 +910,590 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Blend two values based on weights that sum to 32. 
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, + const int a_weight, const int b_weight) { + const uint16x4_t a_product = vmul_n_u16(a, a_weight); + const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight); + + return vrshr_n_u16(sum, 5 /*log2(32)*/); +} + +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16_t a_weight, + const uint16_t b_weight) { + const uint16x8_t a_product = vmulq_n_u16(a, a_weight); + const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + +// Each element of |dest| contains values associated with one weight value. +inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2_u16(source); + } else { + dest->val[0] = vld1_u16(source); + dest->val[1] = vld1_u16(source + 1); + } +} + +// Each element of |dest| contains values associated with one weight value. +inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source, + const bool upsampled) { + if (upsampled) { + *dest = vld2q_u16(source); + } else { + dest->val[0] = vld1q_u16(source); + dest->val[1] = vld1q_u16(source + 1); + } +} + +template <bool upsampled> +inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride, + const int height, const uint16_t* const top, + const int xstep) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_x = (4 + height - 1) << upsample_shift; + const int16x4_t max_base = vdup_n_s16(max_base_x); + const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]); + const int16x4_t index_offset = {0, 1, 2, 3}; + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. 
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. + const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset); + const uint16x4_t max_base_mask = vclt_s16(base_x, max_base); + + uint16x4x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x4_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + // If |upsampled| is true then extract every other value for output. + const uint16x4_t masked_result = + vbsl_u16(max_base_mask, combined, final_top_val); + + vst1_u16(dst, masked_result); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_x], 4 /* width */); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. 
+template <bool upsampled> +inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep) { + assert(width % 8 == 0); + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. 
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + do { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + + base_x = vaddq_s16(base_x, block_step); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + for (int i = y; i < height; ++i) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +// Process a multiple of 8 |width| by any |height|. Processes horizontally +// before vertically in the hopes of being a little more cache friendly. +inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const top, const int xstep, + const bool upsampled) { + assert(width % 8 == 0); + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + const int max_base_index = (width + height - 1) << upsample_shift; + const int16x8_t max_base_x = vdupq_n_s16(max_base_index); + const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]); + const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7}; + + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + const int16x8_t block_step = vdupq_n_s16(base_step8); + + // All rows from |min_corner_only_y| down will simply use Memset. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_index / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. 
+ const int max_no_corner_y = std::min( + ((max_base_index - (base_step * width)) << index_scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + int x = 0; + do { + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + vst1q_u16(dst + x, combined); + + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + // To accommodate reuse of this function in Zone2, permit negative values + // for |xstep|. + const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + // Use signed values to compare |top_base_x| to |max_base_x|. 
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) & + ~7; + for (; x < min_corner_only_x; x += 8, top_base_x += base_step8, + base_x = vaddq_s16(base_x, block_step)) { + const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x); + + uint16x8x2_t sampled_top_row; + LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled); + const uint16x8_t combined = WeightedBlend( + sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0); + + const uint16x8_t masked_result = + vbslq_u16(max_base_mask, combined, final_top_val); + vst1q_u16(dst + x, masked_result); + } + // Corner-only section of the row. + Memset(dst + x, top[max_base_index], width - x); + } + for (; y < height; ++y) { + Memset(dst, top[max_base_index], width); + dst += stride; + } +} + +void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const uint16_t* const top = static_cast<const uint16_t*>(top_row); + uint16_t* dst = static_cast<uint16_t*>(dest); + stride /= sizeof(top[0]); + + assert(xstep > 0); + + if (xstep == 64) { + assert(!upsampled_top); + const uint16_t* top_ptr = top + 1; + const int width_bytes = width * sizeof(top[0]); + int y = height; + do { + memcpy(dst, top_ptr, width_bytes); + memcpy(dst + stride, top_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, top_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, top_ptr + 3, width_bytes); + dst += 4 * stride; + top_ptr += 4; + y -= 4; + } while (y != 0); + } else { + if (width == 4) { + if (upsampled_top) { + DirectionalZone1_4xH<true>(dst, stride, height, top, xstep); + } else { + DirectionalZone1_4xH<false>(dst, stride, height, top, xstep); + } + } else if (width >= 32) { + if (upsampled_top) { + DirectionalZone1_Large(dst, stride, width, height, top, 
xstep, true); + } else { + DirectionalZone1_Large(dst, stride, width, height, top, xstep, false); + } + } else if (upsampled_top) { + DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep); + } else { + DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep); + } + } +} + +// ----------------------------------------------------------------------------- +// Zone 3 +// This can be considered "the transpose of Zone 1." In Zone 1, the fractional +// step applies when moving vertically in the destination block, connected to +// the change in |y|, whereas in this mode, the step applies when moving +// horizontally, connected to the change in |x|. This makes vectorization very +// complicated in row-order, because a given vector may need source pixels that +// span 16 or 32 pixels in steep angles, requiring multiple expensive table +// lookups and checked loads. Rather than work in row order, it is simpler to +// compute |dest| in column order, and then store the transposed results. + +// Compute 4x4 sub-blocks. +// Example of computed sub-blocks of a 4x8 block before and after transpose: +// 00 10 20 30 00 01 02 03 +// 01 11 21 31 10 11 12 13 +// 02 12 22 32 20 21 22 23 +// 03 13 23 33 30 31 32 33 +// ----------- --> ----------- +// 40 50 60 70 40 41 42 43 +// 41 51 61 71 50 51 52 53 +// 42 52 62 72 60 61 62 63 +// 43 53 63 73 70 71 72 73 +template <bool upsampled> +inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. 
+ uint16x4_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x4x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x4(result); + Store4(dst, result[0]); + dst += stride; + Store4(dst, result[1]); + dst += stride; + Store4(dst, result[2]); + dst += stride; + Store4(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride, + const int height, const uint16_t* const left, + const int ystep) { + const int upsample_shift = static_cast<int>(upsampled); + int y = 0; + do { + DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + ystep); + dest += 4 * stride; + y += 4; + } while (y < height); +} + +template <bool upsampled> +inline void 
DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride, + const int width, const uint16_t* const left, + const int ystep) { + int x = 0; + int base_left_y = 0; + do { + // TODO(petersonab): Establish 8x4 transpose to reserve this function for + // 8x4 and 16x4. + DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, + base_left_y); + base_left_y += 4 * ystep; + x += 4; + } while (x < width); +} + +template <bool upsampled> +inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride, + const uint16_t* const left, const int ystep, + const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x8_t result[8]; + + int left_y = base_left_y + ystep; + uint16x8x2_t sampled_left_col; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, 
&left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose8x8(result); + Store8(dest, result[0]); + dest += stride; + Store8(dest, result[1]); + dest += stride; + Store8(dest, result[2]); + dest += stride; + Store8(dest, result[3]); + dest += stride; + Store8(dest, result[4]); + dest += stride; + Store8(dest, result[5]); + dest += stride; + Store8(dest, result[6]); + dest += stride; + Store8(dest, result[7]); +} + +template <bool upsampled> +inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, + const int width, const int height, + const uint16_t* const left, const int ystep) { + const int upsample_shift = 
static_cast<int>(upsampled); + // Zone3 never runs out of left_column values. + assert((width + height - 1) << upsample_shift > // max_base_y + ((ystep * width) >> (6 - upsample_shift)) + + (/* base_step */ 1 << upsample_shift) * + (height - 1)); // left_base_y + int y = 0; + do { + int x = 0; + uint8_t* dst_x = dest + y * stride; + do { + const int base_left_y = ystep * x; + DirectionalZone3_8x8<upsampled>( + dst_x, stride, left + (y << upsample_shift), ystep, base_left_y); + dst_x += 8 * sizeof(uint16_t); + x += 8; + } while (x < width); + y += 8; + } while (y < height); +} + +void DirectionalIntraPredictorZone3_NEON(void* const dest, + const ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled_left) { + const uint16_t* const left = static_cast<const uint16_t*>(left_column); + uint8_t* dst = static_cast<uint8_t*>(dest); + + if (ystep == 64) { + assert(!upsampled_left); + const int width_bytes = width * sizeof(left[0]); + int y = height; + do { + const uint16_t* left_ptr = left + 1; + memcpy(dst, left_ptr, width_bytes); + memcpy(dst + stride, left_ptr + 1, width_bytes); + memcpy(dst + 2 * stride, left_ptr + 2, width_bytes); + memcpy(dst + 3 * stride, left_ptr + 3, width_bytes); + dst += 4 * stride; + left_ptr += 4; + y -= 4; + } while (y != 0); + return; + } + if (width == 4) { + if (upsampled_left) { + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + } else { + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + } + } else if (height == 4) { + if (upsampled_left) { + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + } else { + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + } + } else { + if (upsampled_left) { + // |upsampled_left| can only be true if |width| + |height| <= 16, + // therefore this is 8x8. 
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep); + } else { + DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep); + } + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON; + dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredDirectionalInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h new file mode 100644 index 0000000..f7d6235 --- /dev/null +++ b/src/dsp/arm/intrapred_directional_neon.h @@ -0,0 +1,56 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. 
These functions are not thread-safe. +void IntraPredDirectionalInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc index 411708e..bd9f61d 100644 --- a/src/dsp/arm/intrapred_filter_intra_neon.cc +++ b/src/dsp/arm/intrapred_filter_neon.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The libgav1 Authors +// Copyright 2021 The libgav1 Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_filter.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -160,16 +160,16 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); } +void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { -void IntraPredFilterIntraInit_NEON() {} +void IntraPredFilterInit_NEON() {} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h new file mode 100644 index 0000000..283c1b1 --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. 
+void IntraPredFilterInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc index c967d82..c143648 100644 --- a/src/dsp/arm/intrapred_neon.cc +++ b/src/dsp/arm/intrapred_neon.cc @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -964,6 +965,200 @@ struct DcDefs { using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>; }; +// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows + +template <int block_height> +void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x4_t row = vld1_dup_u16(left + y); + vst1_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x8_t row = vld1q_dup_u16(left + y); + vst1q_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = 
static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +template <int block_height> +void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + vst1q_u16(dst16 + 16, row0); + vst1q_u16(dst16 + 24, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + vst1q_u16(dst16 + 16, row1); + vst1q_u16(dst16 + 24, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +// IntraPredFuncs_NEON::Vertical -- copy top row to all rows + +template <int block_height> +void Vertical4xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x8_t row = vld1_u8(top); + int y = block_height; + do { + vst1_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical8xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row = 
vld1q_u8(top); + int y = block_height; + do { + vst1q_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical16xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> +void Vertical32xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> +void Vertical64xH_NEON(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + const uint8x16_t row4 = vld1q_u8(top + 64); + const uint8x16_t row5 = vld1q_u8(top + 80); + const uint8x16_t row6 = vld1q_u8(top + 
96); + const uint8x16_t row7 = vld1q_u8(top + 112); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + y -= 2; + } while (y != 0); +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); @@ -973,6 +1168,8 @@ void Init10bpp() { DcDefs::_4x4::DcLeft; dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = DcDefs::_4x4::Dc; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + Vertical4xH_NEON<4>; // 4x8 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = @@ -981,6 +1178,10 @@ void Init10bpp() { DcDefs::_4x8::DcLeft; dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = DcDefs::_4x8::Dc; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + Vertical4xH_NEON<8>; // 4x16 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = @@ -989,6 +1190,10 @@ void Init10bpp() { DcDefs::_4x16::DcLeft; dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = DcDefs::_4x16::Dc; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Vertical4xH_NEON<16>; // 8x4 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = @@ -997,6 +1202,8 @@ void Init10bpp() { DcDefs::_8x4::DcLeft; dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = DcDefs::_8x4::Dc; + 
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Vertical8xH_NEON<4>; // 8x8 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = @@ -1005,6 +1212,10 @@ void Init10bpp() { DcDefs::_8x8::DcLeft; dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = DcDefs::_8x8::Dc; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Vertical8xH_NEON<8>; // 8x16 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = @@ -1013,6 +1224,8 @@ void Init10bpp() { DcDefs::_8x16::DcLeft; dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = DcDefs::_8x16::Dc; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Vertical8xH_NEON<16>; // 8x32 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = @@ -1021,6 +1234,10 @@ void Init10bpp() { DcDefs::_8x32::DcLeft; dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = DcDefs::_8x32::Dc; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Vertical8xH_NEON<32>; // 16x4 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = @@ -1029,6 +1246,8 @@ void Init10bpp() { DcDefs::_16x4::DcLeft; dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = DcDefs::_16x4::Dc; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Vertical16xH_NEON<4>; // 16x8 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = @@ -1037,6 +1256,10 @@ void Init10bpp() { DcDefs::_16x8::DcLeft; dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = DcDefs::_16x8::Dc; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Horizontal16xH_NEON<8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Vertical16xH_NEON<8>; // 16x16 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = @@ -1045,6 +1268,8 @@ void Init10bpp() { DcDefs::_16x16::DcLeft; dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = DcDefs::_16x16::Dc; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Vertical16xH_NEON<16>; // 16x32 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = @@ -1053,6 +1278,8 @@ void Init10bpp() { DcDefs::_16x32::DcLeft; dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = DcDefs::_16x32::Dc; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Vertical16xH_NEON<32>; // 16x64 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = @@ -1061,6 +1288,8 @@ void Init10bpp() { DcDefs::_16x64::DcLeft; dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = DcDefs::_16x64::Dc; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Vertical16xH_NEON<64>; // 32x8 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = @@ -1069,6 +1298,8 @@ void Init10bpp() { DcDefs::_32x8::DcLeft; dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = DcDefs::_32x8::Dc; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Vertical32xH_NEON<8>; // 32x16 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = @@ -1077,6 +1308,8 @@ void Init10bpp() { DcDefs::_32x16::DcLeft; dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = DcDefs::_32x16::Dc; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Vertical32xH_NEON<16>; // 32x32 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = @@ -1085,6 +1318,8 @@ void Init10bpp() { DcDefs::_32x32::DcLeft; dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = DcDefs::_32x32::Dc; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Vertical32xH_NEON<32>; // 32x64 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = @@ 
-1093,6 +1328,10 @@ void Init10bpp() { DcDefs::_32x64::DcLeft; dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = DcDefs::_32x64::Dc; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Horizontal32xH_NEON<64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Vertical32xH_NEON<64>; // 64x16 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = @@ -1101,6 +1340,8 @@ void Init10bpp() { DcDefs::_64x16::DcLeft; dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = DcDefs::_64x16::Dc; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Vertical64xH_NEON<16>; // 64x32 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = @@ -1109,6 +1350,8 @@ void Init10bpp() { DcDefs::_64x32::DcLeft; dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = DcDefs::_64x32::Dc; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Vertical64xH_NEON<32>; // 64x64 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = @@ -1117,6 +1360,8 @@ void Init10bpp() { DcDefs::_64x64::DcLeft; dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = DcDefs::_64x64::Dc; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Vertical64xH_NEON<64>; } } // namespace @@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h index 16f858c..b27f29f 100644 --- a/src/dsp/arm/intrapred_neon.h +++ b/src/dsp/arm/intrapred_neon.h @@ -23,396 +23,282 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor, see the defines below for specifics. These -// functions are not thread-safe. 
-void IntraPredCflInit_NEON(); -void IntraPredDirectionalInit_NEON(); -void IntraPredFilterIntraInit_NEON(); +// Initializes Dsp::intra_predictors. +// See the defines below for specifics. These functions are not thread-safe. void IntraPredInit_NEON(); -void IntraPredSmoothInit_NEON(); } // namespace dsp } // namespace libgav1 #if LIBGAV1_ENABLE_NEON -// 8 bit -#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON - // 4x4 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON // 8x8 #define 
LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x4 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 
LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON - -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_NEON // 10 bit // 4x4 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x8 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 4x16 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x4 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x8 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x16 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 8x32 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \ + 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x4 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x8 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x16 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x32 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 16x64 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x8 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON #define 
LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x16 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x32 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 32x64 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x16 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x32 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON // 64x64 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \ LIBGAV1_CPU_NEON #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index abc93e8..c33f333 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_smooth.h" #include "src/utils/cpu.h" #if LIBGAV1_ENABLE_NEON @@ -26,6 +26,7 @@ #include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -605,7 +606,7 @@ void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h new file mode 100644 index 0000000..edd01be --- /dev/null +++ b/src/dsp/arm/intrapred_smooth_neon.h @@ -0,0 +1,149 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_ diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc new file mode 100644 index 0000000..ff184a1 --- /dev/null +++ b/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -0,0 +1,2543 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/inverse_transform.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 + +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. 
#include "src/dsp/inverse_transform.inc"

//------------------------------------------------------------------------------

// Transposes a 4x4 tile of 32-bit lanes held in four q registers.
// in:                 out:
//   00 01 02 03        00 10 20 30
//   10 11 12 13   ->   01 11 21 31
//   20 21 22 23        02 12 22 32
//   30 31 32 33        03 13 23 33
LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
                                        int32x4_t out[4]) {
  // vtrnq_s32 interleaves pairs of lanes:
  //   00 10 02 12  top.val[0]
  //   01 11 03 13  top.val[1]
  //   20 30 22 32  bottom.val[0]
  //   21 31 23 33  bottom.val[1]
  const int32x4x2_t top = vtrnq_s32(in[0], in[1]);
  const int32x4x2_t bottom = vtrnq_s32(in[2], in[3]);
  // The vextq_s32 pairs rotate the 64-bit halves into column order.
  out[0] = vextq_s32(vextq_s32(top.val[0], top.val[0], 2), bottom.val[0], 2);
  out[1] = vextq_s32(vextq_s32(top.val[1], top.val[1], 2), bottom.val[1], 2);
  out[2] = vextq_s32(top.val[0], vextq_s32(bottom.val[0], bottom.val[0], 2), 2);
  out[3] = vextq_s32(top.val[1], vextq_s32(bottom.val[1], bottom.val[1], 2), 2);
}

//------------------------------------------------------------------------------

// Stores |store_count| vectors of 4 lanes to consecutive rows of |dst|,
// starting at column |idx|. |store_count| must be a multiple of 4.
template <int store_count>
LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
                                    const int32x4_t* const s) {
  assert(store_count % 4 == 0);
  for (int row = 0; row < store_count; ++row) {
    vst1q_s32(&dst[row * stride + idx], s[row]);
  }
}

// Loads |load_count| vectors of 4 lanes from consecutive rows of |src|,
// starting at column |idx|. |load_count| must be a multiple of 4.
template <int load_count>
LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
                                   int32_t idx, int32x4_t* x) {
  assert(load_count % 4 == 0);
  for (int row = 0; row < load_count; ++row) {
    x[row] = vld1q_s32(&src[row * stride + idx]);
  }
}

// Butterfly rotate 4 values.
// Butterfly rotation: [x; y] = [cos -sin; sin cos] * [a; b], with 12-bit
// fixed-point coefficients and rounded shift. |flip| swaps the two outputs.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
                                               const int angle,
                                               const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
  const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
  // The max range for the input is 18 bits. The cos128/sin128 is 13 bits,
  // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32
  // bit lane.
  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotation specialized for *a == 0: the products against *a drop
// out, leaving only the *b terms.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
                                                         int32x4_t* b,
                                                         const int angle,
                                                         const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  assert(sin128 <= 0xfff);
  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// Butterfly rotation specialized for *b == 0: the products against *b drop
// out, leaving only the *a terms.
LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
                                                          int32x4_t* b,
                                                          const int angle,
                                                          const bool flip) {
  const int32_t cos128 = Cos128(angle);
  const int32_t sin128 = Sin128(angle);
  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
  const int32x4_t x = vrshrq_n_s32(x0, 12);
  const int32x4_t y = vrshrq_n_s32(y0, 12);
  if (flip) {
    *a = y;
    *b = x;
  } else {
    *a = x;
    *b = y;
  }
}

// In-place Hadamard butterfly using saturating adds:
//   a' = a + b, b' = a - b (operands exchanged when |flip| is set).
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip) {
  int32x4_t x, y;
  if (flip) {
    y = vqaddq_s32(*b, *a);
    x = vqsubq_s32(*b, *a);
  } else {
    x = vqaddq_s32(*a, *b);
    y = vqsubq_s32(*a, *b);
  }
  *a = x;
  *b = y;
}

// Hadamard butterfly with the outputs clamped to [*min, *max]; used on
// intermediate stages to keep values within the stage's valid range.
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
                                            bool flip, const int32x4_t* min,
                                            const int32x4_t* max) {
  int32x4_t x, y;
  if (flip) {
    y = vqaddq_s32(*b, *a);
    x = vqsubq_s32(*b, *a);
  } else {
    x = vqaddq_s32(*a, *b);
    y = vqsubq_s32(*a, *b);
  }
  *a = vmaxq_s32(vminq_s32(x, *max), *min);
  *b = vmaxq_s32(vminq_s32(y, *max), *min);
}

using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
                                       bool flip);

//------------------------------------------------------------------------------
// Discrete Cosine Transforms (DCT).

// Fast path for a row transform where only the DC coefficient is nonzero
// (|adjusted_tx_height| == 1). Broadcasts the transformed DC value across the
// first row and returns true; returns false when the fast path does not apply.
template <int width>
LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                     bool should_round, int row_shift) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32x4_t v_src = vdupq_n_s32(dst[0]);
  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
  const int32x4_t v_src_round =
      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
  // Select the rounded or raw source depending on |should_round|.
  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
  const int32_t cos128 = Cos128(32);
  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
  // vqrshlq_s32 will shift right if shift value is negative.
  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
  // Clamp result to signed 16 bits.
  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
  if (width == 4) {
    vst1q_s32(dst, result);
  } else {
    for (int i = 0; i < width; i += 4) {
      vst1q_s32(dst, result);
      dst += 4;
    }
  }
  return true;
}

// Fast path for a column transform where only the first row is nonzero.
// Transforms the first row in place, then copies it to every other row.
template <int height>
LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                           int width) {
  if (adjusted_tx_height > 1) return false;

  auto* dst = static_cast<int32_t*>(dest);
  const int32_t cos128 = Cos128(32);

  // Calculate dc values for first row.
  if (width == 4) {
    const int32x4_t v_src = vld1q_s32(dst);
    const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
    vst1q_s32(dst, xy);
  } else {
    int i = 0;
    do {
      const int32x4_t v_src = vld1q_s32(&dst[i]);
      const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
      vst1q_s32(&dst[i], xy);
      i += 4;
    } while (i < width);
  }

  // Copy first row to the rest of the block.
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}

// Shared stages of the 4-point DCT. When |is_fast_butterfly| is set the
// rotations use the x-is-zero specializations; when |is_last_stage| is set
// the final Hadamard stage skips the intermediate-range clamp.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 12.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
  } else {
    butterfly_rotation(&s[0], &s[1], 32, true);
    butterfly_rotation(&s[2], &s[3], 48, false);
  }

  // stage 17.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[3], false);
    HadamardRotation(&s[1], &s[2], false);
  } else {
    HadamardRotation(&s[0], &s[3], false, min, max);
    HadamardRotation(&s[1], &s[2], false, min, max);
  }
}

// Process dct4 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  // When |is_row| is true, set range to the row range, otherwise, set to the
  // column range.
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[4], x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  // kBitReverseLookup 0, 2, 1, 3
  s[0] = x[0];
  s[1] = x[2];
  s[2] = x[1];
  s[3] = x[3];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 4; ++i) {
      // Round-shift, then clamp to signed 16 bits via narrow/widen.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(s, s);
  }
  StoreDst<4>(dst, step, 0, s);
}

// Stages unique to the 8-point DCT; operates on s[4]..s[7] before merging
// with the 4-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
                                      const int32x4_t* max,
                                      const bool is_last_stage) {
  // stage 8.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
  } else {
    butterfly_rotation(&s[4], &s[7], 56, false);
    butterfly_rotation(&s[5], &s[6], 24, false);
  }

  // stage 13.
  HadamardRotation(&s[4], &s[5], false, min, max);
  HadamardRotation(&s[6], &s[7], true, min, max);

  // stage 18.
  butterfly_rotation(&s[6], &s[5], 32, true);

  // stage 22.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[7], false);
    HadamardRotation(&s[1], &s[6], false);
    HadamardRotation(&s[2], &s[5], false);
    HadamardRotation(&s[3], &s[4], false);
  } else {
    HadamardRotation(&s[0], &s[7], false, min, max);
    HadamardRotation(&s[1], &s[6], false, min, max);
    HadamardRotation(&s[2], &s[5], false, min, max);
    HadamardRotation(&s[3], &s[4], false, min, max);
  }
}

// Process dct8 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
                                     int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[8], x[8];

  if (is_row) {
    // Rows are processed in 4x4 tiles, transposed so lanes hold columns.
    LoadSrc<4>(dst, step, 0, &x[0]);
    LoadSrc<4>(dst, step, 4, &x[4]);
    Transpose4x4(&x[0], &x[0]);
    Transpose4x4(&x[4], &x[4]);
  } else {
    LoadSrc<8>(dst, step, 0, &x[0]);
  }

  // stage 1.
  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
  s[0] = x[0];
  s[1] = x[4];
  s[2] = x[2];
  s[3] = x[6];
  s[4] = x[1];
  s[5] = x[5];
  s[6] = x[3];
  s[7] = x[7];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 8; ++i) {
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    Transpose4x4(&s[0], &s[0]);
    Transpose4x4(&s[4], &s[4]);
    StoreDst<4>(dst, step, 0, &s[0]);
    StoreDst<4>(dst, step, 4, &s[4]);
  } else {
    StoreDst<8>(dst, step, 0, &s[0]);
  }
}

// Stages unique to the 16-point DCT; operates on s[8]..s[15] before merging
// with the 8-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 5.
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
  } else {
    butterfly_rotation(&s[8], &s[15], 60, false);
    butterfly_rotation(&s[9], &s[14], 28, false);
    butterfly_rotation(&s[10], &s[13], 44, false);
    butterfly_rotation(&s[11], &s[12], 12, false);
  }

  // stage 9.
  HadamardRotation(&s[8], &s[9], false, min, max);
  HadamardRotation(&s[10], &s[11], true, min, max);
  HadamardRotation(&s[12], &s[13], false, min, max);
  HadamardRotation(&s[14], &s[15], true, min, max);

  // stage 14.
  butterfly_rotation(&s[14], &s[9], 48, true);
  butterfly_rotation(&s[13], &s[10], 112, true);

  // stage 19.
  HadamardRotation(&s[8], &s[11], false, min, max);
  HadamardRotation(&s[9], &s[10], false, min, max);
  HadamardRotation(&s[12], &s[15], true, min, max);
  HadamardRotation(&s[13], &s[14], true, min, max);

  // stage 23.
  butterfly_rotation(&s[13], &s[10], 32, true);
  butterfly_rotation(&s[12], &s[11], 32, true);

  // stage 26.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[15], false);
    HadamardRotation(&s[1], &s[14], false);
    HadamardRotation(&s[2], &s[13], false);
    HadamardRotation(&s[3], &s[12], false);
    HadamardRotation(&s[4], &s[11], false);
    HadamardRotation(&s[5], &s[10], false);
    HadamardRotation(&s[6], &s[9], false);
    HadamardRotation(&s[7], &s[8], false);
  } else {
    HadamardRotation(&s[0], &s[15], false, min, max);
    HadamardRotation(&s[1], &s[14], false, min, max);
    HadamardRotation(&s[2], &s[13], false, min, max);
    HadamardRotation(&s[3], &s[12], false, min, max);
    HadamardRotation(&s[4], &s[11], false, min, max);
    HadamardRotation(&s[5], &s[10], false, min, max);
    HadamardRotation(&s[6], &s[9], false, min, max);
    HadamardRotation(&s[7], &s[8], false, min, max);
  }
}

// Process dct16 rows or columns, depending on the |is_row| flag.
template <ButterflyRotationFunc butterfly_rotation>
LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[16], x[16];

  if (is_row) {
    // Rows are processed as transposed 4x4 tiles.
    for (int idx = 0; idx < 16; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<16>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  s[0] = x[0];
  s[1] = x[8];
  s[2] = x[4];
  s[3] = x[12];
  s[4] = x[2];
  s[5] = x[10];
  s[6] = x[6];
  s[7] = x[14];
  s[8] = x[1];
  s[9] = x[9];
  s[10] = x[5];
  s[11] = x[13];
  s[12] = x[3];
  s[13] = x[11];
  s[14] = x[7];
  s[15] = x[15];

  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int i = 0; i < 16; ++i) {
      // Round-shift, then clamp to signed 16 bits via narrow/widen.
      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
    }
    for (int idx = 0; idx < 16; idx += 8) {
      Transpose4x4(&s[idx], &s[idx]);
      Transpose4x4(&s[idx + 4], &s[idx + 4]);
      StoreDst<4>(dst, step, idx, &s[idx]);
      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
    }
  } else {
    StoreDst<16>(dst, step, 0, &s[0]);
  }
}

// Stages unique to the 32-point DCT; operates on s[16]..s[31] before merging
// with the 16-point half in the final butterfly.
template <ButterflyRotationFunc butterfly_rotation,
          bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
                                       const int32x4_t* max,
                                       const bool is_last_stage) {
  // stage 3
  if (is_fast_butterfly) {
    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
  } else {
    butterfly_rotation(&s[16], &s[31], 62, false);
    butterfly_rotation(&s[17], &s[30], 30, false);
    butterfly_rotation(&s[18], &s[29], 46, false);
    butterfly_rotation(&s[19], &s[28], 14, false);
    butterfly_rotation(&s[20], &s[27], 54, false);
    butterfly_rotation(&s[21], &s[26], 22, false);
    butterfly_rotation(&s[22], &s[25], 38, false);
    butterfly_rotation(&s[23], &s[24], 6, false);
  }

  // stage 6.
  HadamardRotation(&s[16], &s[17], false, min, max);
  HadamardRotation(&s[18], &s[19], true, min, max);
  HadamardRotation(&s[20], &s[21], false, min, max);
  HadamardRotation(&s[22], &s[23], true, min, max);
  HadamardRotation(&s[24], &s[25], false, min, max);
  HadamardRotation(&s[26], &s[27], true, min, max);
  HadamardRotation(&s[28], &s[29], false, min, max);
  HadamardRotation(&s[30], &s[31], true, min, max);

  // stage 10.
  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
  butterfly_rotation(&s[26], &s[21], 24, true);
  butterfly_rotation(&s[25], &s[22], 24 + 64, true);

  // stage 15.
  HadamardRotation(&s[16], &s[19], false, min, max);
  HadamardRotation(&s[17], &s[18], false, min, max);
  HadamardRotation(&s[20], &s[23], true, min, max);
  HadamardRotation(&s[21], &s[22], true, min, max);
  HadamardRotation(&s[24], &s[27], false, min, max);
  HadamardRotation(&s[25], &s[26], false, min, max);
  HadamardRotation(&s[28], &s[31], true, min, max);
  HadamardRotation(&s[29], &s[30], true, min, max);

  // stage 20.
  butterfly_rotation(&s[29], &s[18], 48, true);
  butterfly_rotation(&s[28], &s[19], 48, true);
  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
  butterfly_rotation(&s[26], &s[21], 48 + 64, true);

  // stage 24.
  HadamardRotation(&s[16], &s[23], false, min, max);
  HadamardRotation(&s[17], &s[22], false, min, max);
  HadamardRotation(&s[18], &s[21], false, min, max);
  HadamardRotation(&s[19], &s[20], false, min, max);
  HadamardRotation(&s[24], &s[31], true, min, max);
  HadamardRotation(&s[25], &s[30], true, min, max);
  HadamardRotation(&s[26], &s[29], true, min, max);
  HadamardRotation(&s[27], &s[28], true, min, max);

  // stage 27.
  butterfly_rotation(&s[27], &s[20], 32, true);
  butterfly_rotation(&s[26], &s[21], 32, true);
  butterfly_rotation(&s[25], &s[22], 32, true);
  butterfly_rotation(&s[24], &s[23], 32, true);

  // stage 29.
  if (is_last_stage) {
    HadamardRotation(&s[0], &s[31], false);
    HadamardRotation(&s[1], &s[30], false);
    HadamardRotation(&s[2], &s[29], false);
    HadamardRotation(&s[3], &s[28], false);
    HadamardRotation(&s[4], &s[27], false);
    HadamardRotation(&s[5], &s[26], false);
    HadamardRotation(&s[6], &s[25], false);
    HadamardRotation(&s[7], &s[24], false);
    HadamardRotation(&s[8], &s[23], false);
    HadamardRotation(&s[9], &s[22], false);
    HadamardRotation(&s[10], &s[21], false);
    HadamardRotation(&s[11], &s[20], false);
    HadamardRotation(&s[12], &s[19], false);
    HadamardRotation(&s[13], &s[18], false);
    HadamardRotation(&s[14], &s[17], false);
    HadamardRotation(&s[15], &s[16], false);
  } else {
    HadamardRotation(&s[0], &s[31], false, min, max);
    HadamardRotation(&s[1], &s[30], false, min, max);
    HadamardRotation(&s[2], &s[29], false, min, max);
    HadamardRotation(&s[3], &s[28], false, min, max);
    HadamardRotation(&s[4], &s[27], false, min, max);
    HadamardRotation(&s[5], &s[26], false, min, max);
    HadamardRotation(&s[6], &s[25], false, min, max);
    HadamardRotation(&s[7], &s[24], false, min, max);
    HadamardRotation(&s[8], &s[23], false, min, max);
    HadamardRotation(&s[9], &s[22], false, min, max);
    HadamardRotation(&s[10], &s[21], false, min, max);
    HadamardRotation(&s[11], &s[20], false, min, max);
    HadamardRotation(&s[12], &s[19], false, min, max);
    HadamardRotation(&s[13], &s[18], false, min, max);
    HadamardRotation(&s[14], &s[17], false, min, max);
    HadamardRotation(&s[15], &s[16], false, min, max);
  }
}

// Process dct32 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
                                      const bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[32], x[32];

  if (is_row) {
    // Rows are processed as transposed 4x4 tiles.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
  s[0] = x[0];
  s[1] = x[16];
  s[2] = x[8];
  s[3] = x[24];
  s[4] = x[4];
  s[5] = x[20];
  s[6] = x[12];
  s[7] = x[28];
  s[8] = x[2];
  s[9] = x[18];
  s[10] = x[10];
  s[11] = x[26];
  s[12] = x[6];
  s[13] = x[22];
  s[14] = x[14];
  s[15] = x[30];

  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
  s[16] = x[1];
  s[17] = x[17];
  s[18] = x[9];
  s[19] = x[25];
  s[20] = x[5];
  s[21] = x[21];
  s[22] = x[13];
  s[23] = x[29];
  s[24] = x[3];
  s[25] = x[19];
  s[26] = x[11];
  s[27] = x[27];
  s[28] = x[7];
  s[29] = x[23];
  s[30] = x[15];
  s[31] = x[31];

  Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);

  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 32; idx += 8) {
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (int i = 0; i < 8; ++i) {
        // Round-shift, then clamp to signed 16 bits via narrow/widen.
        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<32>(dst, step, 0, &s[0]);
  }
}

// Process dct64 rows or columns, depending on the |is_row| flag. Only the
// first 32 input values are loaded; the rest are known to be zero and are
// handled by the fast (x-is-zero) butterfly specializations.
void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
  const int32x4_t min = vdupq_n_s32(-(1 << range));
  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
  int32x4_t s[64], x[32];

  if (is_row) {
    // The last 32 values of every row are always zero if the |tx_width| is
    // 64.
    for (int idx = 0; idx < 32; idx += 8) {
      LoadSrc<4>(dst, step, idx, &x[idx]);
      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
      Transpose4x4(&x[idx], &x[idx]);
      Transpose4x4(&x[idx + 4], &x[idx + 4]);
    }
  } else {
    // The last 32 values of every column are always zero if the |tx_height| is
    // 64.
    LoadSrc<32>(dst, step, 0, &x[0]);
  }

  // stage 1
  // kBitReverseLookup
  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
  // Odd-indexed s[] entries correspond to the zero upper half of the input
  // and are left uninitialized; the fast butterflies below never read them.
  s[0] = x[0];
  s[2] = x[16];
  s[4] = x[8];
  s[6] = x[24];
  s[8] = x[4];
  s[10] = x[20];
  s[12] = x[12];
  s[14] = x[28];

  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
  s[16] = x[2];
  s[18] = x[18];
  s[20] = x[10];
  s[22] = x[26];
  s[24] = x[6];
  s[26] = x[22];
  s[28] = x[14];
  s[30] = x[30];

  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
  s[32] = x[1];
  s[34] = x[17];
  s[36] = x[9];
  s[38] = x[25];
  s[40] = x[5];
  s[42] = x[21];
  s[44] = x[13];
  s[46] = x[29];

  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
  s[48] = x[3];
  s[50] = x[19];
  s[52] = x[11];
  s[54] = x[27];
  s[56] = x[7];
  s[58] = x[23];
  s[60] = x[15];
  s[62] = x[31];

  Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);
  Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
      s, &min, &max, /*is_last_stage=*/false);

  //-- start dct 64 stages
  // stage 2.
  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);

  // stage 4.
  HadamardRotation(&s[32], &s[33], false, &min, &max);
  HadamardRotation(&s[34], &s[35], true, &min, &max);
  HadamardRotation(&s[36], &s[37], false, &min, &max);
  HadamardRotation(&s[38], &s[39], true, &min, &max);
  HadamardRotation(&s[40], &s[41], false, &min, &max);
  HadamardRotation(&s[42], &s[43], true, &min, &max);
  HadamardRotation(&s[44], &s[45], false, &min, &max);
  HadamardRotation(&s[46], &s[47], true, &min, &max);
  HadamardRotation(&s[48], &s[49], false, &min, &max);
  HadamardRotation(&s[50], &s[51], true, &min, &max);
  HadamardRotation(&s[52], &s[53], false, &min, &max);
  HadamardRotation(&s[54], &s[55], true, &min, &max);
  HadamardRotation(&s[56], &s[57], false, &min, &max);
  HadamardRotation(&s[58], &s[59], true, &min, &max);
  HadamardRotation(&s[60], &s[61], false, &min, &max);
  HadamardRotation(&s[62], &s[63], true, &min, &max);

  // stage 7.
  ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
  ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
  ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
  ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
  ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
  ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
  ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
  ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);

  // stage 11.
  HadamardRotation(&s[32], &s[35], false, &min, &max);
  HadamardRotation(&s[33], &s[34], false, &min, &max);
  HadamardRotation(&s[36], &s[39], true, &min, &max);
  HadamardRotation(&s[37], &s[38], true, &min, &max);
  HadamardRotation(&s[40], &s[43], false, &min, &max);
  HadamardRotation(&s[41], &s[42], false, &min, &max);
  HadamardRotation(&s[44], &s[47], true, &min, &max);
  HadamardRotation(&s[45], &s[46], true, &min, &max);
  HadamardRotation(&s[48], &s[51], false, &min, &max);
  HadamardRotation(&s[49], &s[50], false, &min, &max);
  HadamardRotation(&s[52], &s[55], true, &min, &max);
  HadamardRotation(&s[53], &s[54], true, &min, &max);
  HadamardRotation(&s[56], &s[59], false, &min, &max);
  HadamardRotation(&s[57], &s[58], false, &min, &max);
  HadamardRotation(&s[60], &s[63], true, &min, &max);
  HadamardRotation(&s[61], &s[62], true, &min, &max);

  // stage 16.
  ButterflyRotation_4(&s[61], &s[34], 56, true);
  ButterflyRotation_4(&s[60], &s[35], 56, true);
  ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
  ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
  ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
  ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
  ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
  ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);

  // stage 21.
  HadamardRotation(&s[32], &s[39], false, &min, &max);
  HadamardRotation(&s[33], &s[38], false, &min, &max);
  HadamardRotation(&s[34], &s[37], false, &min, &max);
  HadamardRotation(&s[35], &s[36], false, &min, &max);
  HadamardRotation(&s[40], &s[47], true, &min, &max);
  HadamardRotation(&s[41], &s[46], true, &min, &max);
  HadamardRotation(&s[42], &s[45], true, &min, &max);
  HadamardRotation(&s[43], &s[44], true, &min, &max);
  HadamardRotation(&s[48], &s[55], false, &min, &max);
  HadamardRotation(&s[49], &s[54], false, &min, &max);
  HadamardRotation(&s[50], &s[53], false, &min, &max);
  HadamardRotation(&s[51], &s[52], false, &min, &max);
  HadamardRotation(&s[56], &s[63], true, &min, &max);
  HadamardRotation(&s[57], &s[62], true, &min, &max);
  HadamardRotation(&s[58], &s[61], true, &min, &max);
  HadamardRotation(&s[59], &s[60], true, &min, &max);

  // stage 25.
  ButterflyRotation_4(&s[59], &s[36], 48, true);
  ButterflyRotation_4(&s[58], &s[37], 48, true);
  ButterflyRotation_4(&s[57], &s[38], 48, true);
  ButterflyRotation_4(&s[56], &s[39], 48, true);
  ButterflyRotation_4(&s[55], &s[40], 112, true);
  ButterflyRotation_4(&s[54], &s[41], 112, true);
  ButterflyRotation_4(&s[53], &s[42], 112, true);
  ButterflyRotation_4(&s[52], &s[43], 112, true);

  // stage 28.
  HadamardRotation(&s[32], &s[47], false, &min, &max);
  HadamardRotation(&s[33], &s[46], false, &min, &max);
  HadamardRotation(&s[34], &s[45], false, &min, &max);
  HadamardRotation(&s[35], &s[44], false, &min, &max);
  HadamardRotation(&s[36], &s[43], false, &min, &max);
  HadamardRotation(&s[37], &s[42], false, &min, &max);
  HadamardRotation(&s[38], &s[41], false, &min, &max);
  HadamardRotation(&s[39], &s[40], false, &min, &max);
  HadamardRotation(&s[48], &s[63], true, &min, &max);
  HadamardRotation(&s[49], &s[62], true, &min, &max);
  HadamardRotation(&s[50], &s[61], true, &min, &max);
  HadamardRotation(&s[51], &s[60], true, &min, &max);
  HadamardRotation(&s[52], &s[59], true, &min, &max);
  HadamardRotation(&s[53], &s[58], true, &min, &max);
  HadamardRotation(&s[54], &s[57], true, &min, &max);
  HadamardRotation(&s[55], &s[56], true, &min, &max);

  // stage 30.
  ButterflyRotation_4(&s[55], &s[40], 32, true);
  ButterflyRotation_4(&s[54], &s[41], 32, true);
  ButterflyRotation_4(&s[53], &s[42], 32, true);
  ButterflyRotation_4(&s[52], &s[43], 32, true);
  ButterflyRotation_4(&s[51], &s[44], 32, true);
  ButterflyRotation_4(&s[50], &s[45], 32, true);
  ButterflyRotation_4(&s[49], &s[46], 32, true);
  ButterflyRotation_4(&s[48], &s[47], 32, true);

  // stage 31.
  for (int i = 0; i < 32; i += 4) {
    HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
    HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
    HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
    HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
  }
  //-- end dct 64 stages
  if (is_row) {
    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
    for (int idx = 0; idx < 64; idx += 8) {
      int32x4_t output[8];
      Transpose4x4(&s[idx], &output[0]);
      Transpose4x4(&s[idx + 4], &output[4]);
      for (int i = 0; i < 8; ++i) {
        // Round-shift, then clamp to signed 16 bits via narrow/widen.
        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
      }
      StoreDst<4>(dst, step, idx, &output[0]);
      StoreDst<4>(dst, step, idx + 4, &output[4]);
    }
  } else {
    StoreDst<64>(dst, step, 0, &s[0]);
  }
}

//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).

// Process adst4 rows or columns, depending on the |is_row| flag.
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
                                      int row_shift) {
  auto* const dst = static_cast<int32_t*>(dest);
  int32x4_t s[8];
  int32x4_t x[4];

  LoadSrc<4>(dst, step, 0, x);
  if (is_row) {
    Transpose4x4(x, x);
  }

  // stage 1.
  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);

  // stage 2.
  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
  const int32x4_t b7 = vaddq_s32(a7, x[3]);

  // stage 3.
  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
  // s[0] = s[0] + s[3]
  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
  // s[1] = s[1] - s[4]
  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);

  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);

  // stage 4.
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[6]);

  // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]); + const int32x4_t x1 = vaddq_s32(s[1], s[3]); + const int32x4_t x3_a = vaddq_s32(s[0], s[1]); + const int32x4_t x3 = vsubq_s32(x3_a, s[3]); + x[0] = vrshrq_n_s32(x0, 12); + x[1] = vrshrq_n_s32(x1, 12); + x[2] = vrshrq_n_s32(s[2], 12); + x[3] = vrshrq_n_s32(x3, 12); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift))); + x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); + x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); + x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); + Transpose4x4(x, x); + } + StoreDst<4>(dst, step, 0, x); +} + +alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, + 2482}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[2]; + + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src0_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + + const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0); + const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier); + s[1] = vdupq_n_s32(0); + + // s0*k0 s0*k1 s0*k2 s0*k1 + s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src); + // 0 0 0 s0*k0 + s[1] = vextq_s32(s[1], s[0], 1); + + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x3, 12); + + // vqrshlq_s32 will shift right if shift value is negative. 
+ vst1q_s32(dst, + vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift))))); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[4]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + + s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]); + s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]); + + const int32x4_t x0 = s[0]; + const int32x4_t x1 = s[1]; + const int32x4_t x2 = s[2]; + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x0, 12); + const int32x4_t dst_1 = vrshrq_n_s32(x1, 12); + const int32x4_t dst_2 = vrshrq_n_s32(x2, 12); + const int32x4_t dst_3 = vrshrq_n_s32(x3, 12); + + vst1q_s32(&dst[i], dst_0); + vst1q_s32(&dst[i + width * 1], dst_1); + vst1q_s32(&dst[i + width * 2], dst_2); + vst1q_s32(&dst[i + width * 3], dst_3); + + i += 4; + } while (i < width); + + return true; +} + +template <ButterflyRotationFunc butterfly_rotation> +LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. 
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 8; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + StoreDst<4>(dst, step, 0, &x[0]); + StoreDst<4>(dst, step, 4, &x[4]); + } else { + StoreDst<8>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[8]; + + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + // stage 2. 
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int i = 0; i < 8; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[8]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. 
+ int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int j = 0; j < 8; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template <ButterflyRotationFunc butterfly_rotation> +LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. 
+ HadamardRotation(&s[0], &s[8], false, &min, &max); + HadamardRotation(&s[1], &s[9], false, &min, &max); + HadamardRotation(&s[2], &s[10], false, &min, &max); + HadamardRotation(&s[3], &s[11], false, &min, &max); + HadamardRotation(&s[4], &s[12], false, &min, &max); + HadamardRotation(&s[5], &s[13], false, &min, &max); + HadamardRotation(&s[6], &s[14], false, &min, &max); + HadamardRotation(&s[7], &s[15], false, &min, &max); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false, &min, &max); + HadamardRotation(&s[8], &s[12], false, &min, &max); + HadamardRotation(&s[1], &s[5], false, &min, &max); + HadamardRotation(&s[9], &s[13], false, &min, &max); + HadamardRotation(&s[2], &s[6], false, &min, &max); + HadamardRotation(&s[10], &s[14], false, &min, &max); + HadamardRotation(&s[3], &s[7], false, &min, &max); + HadamardRotation(&s[11], &s[15], false, &min, &max); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false, &min, &max); + HadamardRotation(&s[4], &s[6], false, &min, &max); + HadamardRotation(&s[8], &s[10], false, &min, &max); + HadamardRotation(&s[12], &s[14], false, &min, &max); + HadamardRotation(&s[1], &s[3], false, &min, &max); + HadamardRotation(&s[5], &s[7], false, &min, &max); + HadamardRotation(&s[9], &s[11], false, &min, &max); + HadamardRotation(&s[13], &s[15], false, &min, &max); + + // stage 8. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. 
+ x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int i = 0; i < 16; ++i) { + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + StoreDst<4>(dst, step, idx, &x[idx]); + StoreDst<4>(dst, step, idx + 4, &x[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. 
+ x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 16; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + int i = 0; + do { + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. 
+ +LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_mult_lo = + vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int shift = tx_height < 16 ? 
0 : 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +template <int identity_size> +LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16, + "Invalid identity_size."); + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[i * 4]); + v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + 
uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst); + frame_data.val[1] = vld1_u16(dst + stride); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + dst += stride << 1; + i += 2; + } while (i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < 
tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int32_t* source) { + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&source[i * 4]); + const int32x4_t v_dst_row = + vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12); + const int32x4_t v_dst_col = + vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12); + const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data)); + vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + v_src_round.val[0] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12); + v_src_round.val[1] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12); + v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]); + v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]); + v_dst_col.val[0] = + vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier); + v_dst_col.val[1] = + vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier); + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12); + a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12); + b.val[0] = 
vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) { + auto* const dst = static_cast<int32_t*>(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1); + const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast<int32_t*>(dest); + + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]); + const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo); + const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo))); + vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src); + const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift)); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast<int32_t*>(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + int32x4x2_t v_src; + v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]); + v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + const int32x4_t v_src_mult_hi = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo))); + vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi))); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int32_t*>(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift))); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +//------------------------------------------------------------------------------ +// row/column transform loops + +template <int tx_height> +LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) { + if (tx_width >= 16) { + int i = 0; + do { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + const int32x4_t c = vld1q_s32(&source[i + 8]); + const int32x4_t d = vld1q_s32(&source[i + 12]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + const int32x4_t c_rev = vrev64q_s32(c); + const int32x4_t d_rev = vrev64q_s32(d); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2)); + vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2)); + i += 16; + } while (i < tx_width * tx_height); + } else if (tx_width == 8) { + for (int i = 0; i < 8 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2)); + } + } else { + // Process two rows per iteration. 
+ for (int i = 0; i < 4 * tx_height; i += 8) { + // 00 01 02 03 + const int32x4_t a = vld1q_s32(&source[i]); + const int32x4_t b = vld1q_s32(&source[i + 4]); + // 01 00 03 02 + const int32x4_t a_rev = vrev64q_s32(a); + const int32x4_t b_rev = vrev64q_s32(b); + // 03 02 01 00 + vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2)); + vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2)); + } + } +} + +template <int tx_width> +LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) { + // Process two rows per iteration. + int i = 0; + do { + const int32x4_t a_lo = vld1q_s32(&source[i]); + const int32x4_t a_hi = vld1q_s32(&source[i + 4]); + const int32x4_t b_lo = + vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12)); + const int32x4_t b_hi = + vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12)); + vst1q_s32(&source[i], b_lo); + vst1q_s32(&source[i + 4], b_hi); + i += 8; + } while (i < tx_width * num_rows); +} + +template <int tx_width> +LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows, + int row_shift) { + // vqrshlq_s32 will shift right if shift value is negative. + row_shift = -row_shift; + + // Process two rows per iteration. + int i = 0; + do { + const int32x4_t residual0 = vld1q_s32(&source[i]); + const int32x4_t residual1 = vld1q_s32(&source[i + 4]); + vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift))); + vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift))); + i += 8; + } while (i < tx_width * num_rows); +} + +template <int tx_height, bool enable_flip_rows = false> +LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( + Array2DView<uint16_t> frame, const int start_x, const int start_y, + const int tx_width, const int32_t* source, TransformType tx_type) { + const bool flip_rows = + enable_flip_rows ? 
kTransformFlipRowsMask.Contains(tx_type) : false; + const int stride = frame.columns(); + uint16_t* dst = frame[start_y] + start_x; + + if (tx_width == 4) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4; + const int32x4_t residual = vld1q_s32(&source[row]); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1))); + dst += stride; + } + } else { + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width; + int j = 0; + do { + const int x = start_x + j; + const int32x4_t residual = vld1q_s32(&source[row + j]); + const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]); + const uint16x8_t frame_data = vld1q_u16(frame[y] + x); + const int32x4_t a = vrshrq_n_s32(residual, 4); + const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4); + const uint32x4_t b = + vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data)); + const uint32x4_t b_hi = + vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data)); + const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b)); + const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi)); + vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi), + vdupq_n_u16((1 << kBitdepth10) - 1))); + j += 8; + } while (j < tx_width); + } + } +} + +void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + const int row_shift = (tx_height == 16); + + if (DctDcOnly<4>(src, 
adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d dct4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true, + row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d dct8 rows in parallel per iteration. 
+ int i = adjusted_tx_height; + auto* data = src; + do { + Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true, + row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct16 rows in parallel per iteration. 
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift); + data += 64; + i -= 4; + } while (i != 0); +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct16 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct32 rows in parallel per iteration. 
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift); + data += 128; + i -= 4; + } while (i != 0); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<32>(src, tx_width); + } + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct32 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct64 rows in parallel per iteration. 
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift); + data += 128 * 2; + i -= 4; + } while (i != 0); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<64>(src, tx_width); + } + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct64 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const int row_shift = static_cast<int>(tx_height == 16); + const bool should_round = (tx_height == 8); + + if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d adst4 rows in parallel per iteration. 
+ int i = adjusted_tx_height; + auto* data = src; + do { + Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift); + data += 16; + i -= 4; + } while (i != 0); +} + +void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d adst8 rows in parallel per iteration. 
+ assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8, + /*transpose=*/true, row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + do { + // Process 4 1d adst16 rows in parallel per iteration. 
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Adst16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + int i = tx_width; + auto* data = src; + do { + // Process 4 1d adst16 columns in parallel per iteration. + Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Identity4TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize4x4) { + return; + } + + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + + if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + const int shift = tx_height > 8 ? 
1 : 0; + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4, shift); + src += 16; + i -= 4; + } while (i != 0); +} + +void Identity4TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + // Special case: Process row calculations during column transform call. + if (tx_type == kTransformTypeIdentityIdentity && + (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) { + Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); + return; + } + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity8TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize8x4) { + return; + } + + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 16 can be simplified + // from ((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed 16 + // bit value. 
+ if ((tx_height & 0x18) != 0) { + for (int i = 0; i < tx_height; ++i) { + const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]); + const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]); + vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo))); + vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi))); + } + return; + } + if (tx_height == 32) { + int i = adjusted_tx_height; + do { + Identity8Row32_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); + return; + } + + assert(tx_size == kTransformSize8x4); + int i = adjusted_tx_height; + do { + Identity8Row4_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); +} + +void Identity8TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast<int32_t*>(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + int i = adjusted_tx_height; + do { + Identity16Row_NEON(src, /*step=*/16, row_shift); + src += 64; + i -= 4; + } while (i != 0); +} + +void Identity16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int 
start_y, void* dst_frame) { + auto* src = static_cast<int32_t*>(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame); + IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +//------------------------------------------------------------------------------ + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. 
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + Identity16TransformLoopColumn_NEON; +} + +} // namespace + +void InverseTransformInit10bpp_NEON() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10 +namespace libgav1 { +namespace dsp { + +void InverseTransformInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 072991a..315d5e9 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -3117,7 +3117,7 @@ void 
InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h index af647e8..91e0e83 100644 --- a/src/dsp/arm/inverse_transform_neon.h +++ b/src/dsp/arm/inverse_transform_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::inverse_transforms, see the defines below for specifics. // This function is not thread-safe. void InverseTransformInit_NEON(); +void InverseTransformInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 @@ -47,6 +48,21 @@ void InverseTransformInit_NEON(); #define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON + #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc index 146c983..8d72892 100644 --- a/src/dsp/arm/loop_filter_neon.cc +++ b/src/dsp/arm/loop_filter_neon.cc 
@@ -35,7 +35,7 @@ namespace { // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) { const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh)); - return vorr_u8(a, RightShift<32>(a)); + return vorr_u8(a, RightShiftVector<32>(a)); } // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh @@ -44,7 +44,7 @@ inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8x2_t a = Interleave32(p0q0, p1q1); const uint8x8_t b = vabd_u8(a.val[0], a.val[1]); const uint8x8_t p0q0_double = vqadd_u8(b, b); - const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1)); + const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1)); const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half); return vcle_u8(c, vdup_n_u8(outer_thresh)); } @@ -56,7 +56,7 @@ inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1, const uint8_t inner_thresh, const uint8_t outer_thresh) { const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a)); + const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -121,7 +121,7 @@ inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1, vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l)); // Need to shift the second term or we end up with a2_ma2. 
const int8x8_t a2_ma1 = - InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1))); + InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1))); const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1); *p1q1_result = vqmovun_s16(p1q1_a3); @@ -251,7 +251,7 @@ inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t abd_p0p2_q0q2) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(1)); - return vand_u8(b, RightShift<32>(b)); + return vand_u8(b, RightShiftVector<32>(b)); } // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && @@ -264,7 +264,7 @@ inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1, const uint8_t outer_thresh) { const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b)); + const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -482,7 +482,7 @@ inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0, const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1); const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2); const uint8x8_t c = vcle_u8(b, vdup_n_u8(1)); - return vand_u8(c, RightShift<32>(c)); + return vand_u8(c, RightShiftVector<32>(c)); } // abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && @@ -498,7 +498,7 @@ inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1, const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2); const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3); const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh)); - const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c)); + const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c)); const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); return vand_u8(inner_mask, outer_mask); } @@ -1179,7 +1179,7 @@ void LoopFilterInit_NEON() { 
low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index 337c9b4..e6ceb66 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -41,10 +41,25 @@ inline uint8x8_t VshrU128(const uint8x8x2_t src) { } template <int bytes> +inline uint8x8_t VshrU128(const uint8x8_t src[2]) { + return vext_u8(src[0], src[1], bytes); +} + +template <int bytes> +inline uint8x16_t VshrU128(const uint8x16_t src[2]) { + return vextq_u8(src[0], src[1], bytes); +} + +template <int bytes> inline uint16x8_t VshrU128(const uint16x8x2_t src) { return vextq_u16(src.val[0], src.val[1], bytes / 2); } +template <int bytes> +inline uint16x8_t VshrU128(const uint16x8_t src[2]) { + return vextq_u16(src[0], src[1], bytes / 2); +} + // Wiener // Must make a local copy of coefficients to help compiler know that they have @@ -177,18 +192,17 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, int16_t** const wiener_buffer) { for (int y = height; y != 0; --y) { const uint8_t* src_ptr = src; - uint8x16_t s[4]; - s[0] = vld1q_u8(src_ptr); + uint8x16_t s[3]; ptrdiff_t x = width; do { - src_ptr += 16; - s[3] = vld1q_u8(src_ptr); - s[1] = vextq_u8(s[0], s[3], 1); - s[2] = vextq_u8(s[0], s[3], 2); + // Slightly faster than using vextq_u8(). + s[0] = vld1q_u8(src_ptr); + s[1] = vld1q_u8(src_ptr + 1); + s[2] = vld1q_u8(src_ptr + 2); int16x8x2_t sum; sum.val[0] = sum.val[1] = vdupq_n_s16(0); WienerHorizontalSum(s, filter, sum, *wiener_buffer); - s[0] = s[3]; + src_ptr += 16; *wiener_buffer += 16; x -= 16; } while (x != 0); @@ -476,12 +490,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, // For width 16 and up, store the horizontal results, and then do the vertical // filter row by row. 
This is faster than doing it column by column when // considering cache issues. -void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_NEON( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -509,39 +523,42 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, const auto* const top = static_cast<const uint8_t*>(top_border); const auto* const bottom = static_cast<const uint8_t*>(bottom_border); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal, + WienerHorizontalTap7(bottom - 3, 
bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, 
wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -574,13 +591,20 @@ void WienerFilter_NEON(const RestorationUnitInfo& restoration_info, //------------------------------------------------------------------------------ // SGR -inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) { +inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); } -inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], +template <int offset> +inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); +} + +inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3], uint16x4_t high[3]) { uint16x8_t s[3]; s[0] = VshrU128<0>(src); @@ -594,7 +618,7 @@ inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], high[2] = vget_high_u16(s[2]); } -inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { +inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) { dst[0] = VshrU128<0>(src); dst[1] = VshrU128<1>(src); dst[2] = VshrU128<2>(src); @@ -602,7 +626,16 @@ inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { dst[4] = VshrU128<4>(src); } -inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5], +template <int offset> +inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); + dst[3] = VshrU128<offset + 3>(src); + dst[4] = VshrU128<offset + 4>(src); +} + +inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5], uint16x4_t high[5]) { Prepare3_16(src, low, high); const uint16x8_t s3 = VshrU128<6>(src); @@ -641,6 +674,30 @@ 
inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) { return vaddw_u8(sum, src[2]); } +inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + return vaddw_u8(sum, vget_low_u8(src[2])); +} + +inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + return vaddw_u8(sum, vget_high_u8(src[2])); +} + +inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_low_u8(src[4])); +} + +inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_high_u8(src[4])); +} + inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) { const uint32x4_t sum = vaddl_u16(src[0], src[1]); return vaddw_u16(sum, src[2]); @@ -678,13 +735,28 @@ inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) { return vaddw_u16(sum0123, src[4]); } -inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) { uint8x8_t s[3]; Prepare3_8(src, s); return Sum3W_16(s); } -inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum3Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum3Horizontal(s); +} + +template <int offset> +inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) { + uint8x16_t s[3]; + Prepare3_8<offset>(src, s); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t 
src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t sum; Prepare3_16(src, low, high); @@ -693,7 +765,7 @@ inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { return sum; } -inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum01 = vaddl_u8(s[0], s[1]); @@ -702,7 +774,23 @@ inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { return vaddw_u8(sum0123, s[4]); } -inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { +inline uint16x8_t Sum5Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum5Horizontal(s); +} + +template <int offset> +inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0, + uint16x8_t* const dst1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) { uint16x4_t low[5], high[5]; Prepare5_16(src, low, high); uint32x4x2_t sum; @@ -711,35 +799,68 @@ inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { return sum; } -void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, - uint32x4_t* const row_sq5) { - const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); - const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); - *row_sq3 = vaddw_u16(sum12, src[3]); - *row_sq5 = vaddq_u32(sum04, *row_sq3); +template <int offset> +void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0, + uint16x8_t* const row3_1, uint16x8_t* const row5_0, + uint16x8_t* const row5_1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4])); + const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4])); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = vaddq_u16(sum04_lo, *row3_0); + *row5_1 = vaddq_u16(sum04_hi, 
*row3_1); } -void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq, - uint16x8_t* const row3, uint16x8_t* const row5, - uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { +void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3, + uint16x8_t* const row5) { uint8x8_t s[5]; Prepare5_8(src, s); const uint16x8_t sum04 = vaddl_u8(s[0], s[4]); const uint16x8_t sum12 = vaddl_u8(s[1], s[2]); *row3 = vaddw_u8(sum12, s[3]); *row5 = vaddq_u16(sum04, *row3); +} + +void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, + uint32x4_t* const row_sq5) { + const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); + const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); + *row_sq3 = vaddw_u16(sum12, src[3]); + *row_sq5 = vaddq_u32(sum04, *row_sq3); +} + +void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3, + uint32x4x2_t* const row_sq5) { uint16x4_t low[5], high[5]; Prepare5_16(sq, low, high); SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]); SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]); } -inline uint16x8_t Sum343(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + SumHorizontal(src, row3, row5); + SumHorizontal(sq, row_sq3, row_sq5); +} + +void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5); +} + +template <int offset> +inline uint16x8_t Sum343(const uint8x16_t ma3[2]) { + const uint16x8_t sum = (offset == 0) ? 
Sum3WLo16(ma3) : Sum3WHi16(ma3); const uint16x8_t sum3 = Sum3_16(sum, sum, sum); - return vaddw_u8(sum3, s[1]); + return vaddw_u8(sum3, + (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); } inline uint32x4_t Sum343W(const uint16x4_t src[3]) { @@ -748,7 +869,7 @@ inline uint32x4_t Sum343W(const uint16x4_t src[3]) { return vaddw_u16(sum3, src[1]); } -inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -757,13 +878,13 @@ inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { return d; } -inline uint16x8_t Sum565(const uint8x8x2_t src) { - uint8x8_t s[3]; - Prepare3_8(src, s); - const uint16x8_t sum = Sum3W_16(s); +template <int offset> +inline uint16x8_t Sum565(const uint8x16_t ma5[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5); const uint16x8_t sum4 = vshlq_n_u16(sum, 2); const uint16x8_t sum5 = vaddq_u16(sum4, sum); - return vaddw_u8(sum5, s[1]); + return vaddw_u8(sum5, + (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1])); } inline uint32x4_t Sum565W(const uint16x4_t src[3]) { @@ -773,7 +894,7 @@ inline uint32x4_t Sum565W(const uint16x4_t src[3]) { return vaddw_u16(sum5, src[1]); } -inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { +inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) { uint16x4_t low[3], high[3]; uint32x4x2_t d; Prepare3_16(src, low, high); @@ -783,21 +904,21 @@ inline uint32x4x2_t Sum565W(const uint16x8x2_t src) { } inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sum3, - uint16_t* sum5, uint32_t* square_sum3, - uint32_t* square_sum5) { - int y = height; + const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + // Don't change loop width to 16, which is even slower. 
do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row3, row5; uint32x4x2_t row_sq3, row_sq5; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5); vst1q_u16(sum3, row3); vst1q_u16(sum5, row5); @@ -805,8 +926,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u32(square_sum3 + 4, row_sq3.val[1]); vst1q_u32(square_sum5 + 0, row_sq5.val[0]); vst1q_u32(square_sum5 + 4, row_sq5.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sum3 += 8; sum5 += 8; square_sum3 += 8; @@ -819,21 +940,22 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, template <int size> inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, - const int height, const ptrdiff_t sum_stride, uint16_t* sums, + const ptrdiff_t sum_stride, uint16_t* sums, uint32_t* square_sums) { static_assert(size == 3 || size == 5, ""); - int y = height; + int y = 2; + // Don't change loop width to 16, which is even slower. 
do { - uint8x8x2_t s; - uint16x8x2_t sq; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = vld1_u8(src); + sq[0] = vmull_u8(s[0], s[0]); ptrdiff_t x = 0; do { uint16x8_t row; uint32x4x2_t row_sq; - s.val[1] = vld1_u8(src + x + 8); - sq.val[1] = vmull_u8(s.val[1], s.val[1]); + s[1] = vld1_u8(src + x + 8); + sq[1] = vmull_u8(s[1], s[1]); if (size == 3) { row = Sum3Horizontal(s); row_sq = Sum3WHorizontal(sq); @@ -844,8 +966,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, vst1q_u16(sums, row); vst1q_u32(square_sums + 0, row_sq.val[0]); vst1q_u32(square_sums + 4, row_sq.val[1]); - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + s[0] = s[1]; + sq[0] = sq[1]; sums += 8; square_sums += 8; x += 8; @@ -871,10 +993,18 @@ inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq, return vmovn_u32(shifted); } -template <int n> +inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index, + const int threshold) { + const uint8x8_t thresholds = vdup_n_u8(threshold); + const uint8x8_t offset = vcgt_u8(index, thresholds); + // Adding 255 is equivalent to subtracting 1 for 8-bit data. + return vadd_u8(value, offset); +} + +template <int n, int offset> inline void CalculateIntermediate(const uint16x8_t sum, const uint32x4x2_t sum_sq, - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { constexpr uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; @@ -882,19 +1012,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, const uint16x4_t z1 = CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale); const uint16x8_t z01 = vcombine_u16(z0, z1); - // Using vqmovn_u16() needs an extra sign extension instruction. - const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255)); - // Using vgetq_lane_s16() can save the sign extension instruction. 
- const uint8_t lookup[8] = { - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], - kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; - *ma = vld1_u8(lookup); + const uint8x8_t idx = vqmovn_u16(z01); + // Use table lookup to read elements whose indices are less than 48. + // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than + // using two uint8x8x3_t vectors. + uint8x8x4_t table0; + uint8x8x2_t table1; + table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8); + table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8); + table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8); + table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8); + table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8); + table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8); + // All elements whose indices are out of range [0, 47] are set to 0. + uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31]. + // Subtract 8 to shuffle the next index range. + const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32)); + const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47]. + // Use OR instruction to combine shuffle results together. + val = vorr_u8(val, res); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Elements whose indices are larger than 47 (with value 0) are set to 5. + val = vmax_u8(val, vdup_n_u8(5)); + val = AdjustValue(val, idx, 55); // 55 is the last index which value is 5. + val = AdjustValue(val, idx, 72); // 72 is the last index which value is 4. 
+ val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3. + val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2. + val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1. + *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma)) + : vcombine_u8(vget_low_u8(*ma), val); + // b = ma * b * one_over_n // |ma| = [0, 255] // |sum| is a box sum with radius 1 or 2. @@ -906,7 +1056,8 @@ inline void CalculateIntermediate(const uint16x8_t sum, // |kSgrProjReciprocalBits| is 12. // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). - const uint16x8_t maq = vmovl_u8(*ma); + const uint16x8_t maq = + vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma)); const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum)); const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum)); const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n); @@ -916,37 +1067,39 @@ inline void CalculateIntermediate(const uint16x8_t sum, *b = vcombine_u16(b_lo, b_hi); } +template <int offset> inline void CalculateIntermediate5(const uint16x8_t s5[5], const uint32x4x2_t sq5[5], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum5_16(s5); const uint32x4x2_t sum_sq = Sum5_32(sq5); - CalculateIntermediate<25>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b); } +template <int offset> inline void CalculateIntermediate3(const uint16x8_t s3[3], const uint32x4x2_t sq3[3], - const uint32_t scale, uint8x8_t* const ma, + const uint32_t scale, uint8x16_t* const ma, uint16x8_t* const b) { const uint16x8_t sum = Sum3_16(s3); const uint32x4x2_t sum_sq = Sum3_32(sq3); - CalculateIntermediate<9>(sum, sum_sq, scale, ma, b); + CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t 
b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint16x8_t* const sum_ma444, uint32x4x2_t* const sum_b343, uint32x4x2_t* const sum_b444, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { - uint8x8_t s[3]; - Prepare3_8(ma3, s); - const uint16x8_t sum_ma111 = Sum3W_16(s); + const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); *sum_ma444 = vshlq_n_u16(sum_ma111, 2); const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111); - *sum_ma343 = vaddw_u8(sum333, s[1]); + *sum_ma343 = vaddw_u8( + sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); uint16x4_t low[3], high[3]; uint32x4x2_t sum_b111; Prepare3_16(b3, low, high); @@ -966,93 +1119,211 @@ inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, vst1q_u32(b444 + x + 4, sum_b444->val[1]); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16x8_t* const sum_ma343, uint32x4x2_t* const sum_b343, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma444; uint32x4x2_t sum_b444; - Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343, - ma444, b343, b444); + Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, + ma343, ma444, b343, b444); } -inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], const ptrdiff_t x, uint16_t* const ma343, uint16_t* const ma444, uint32_t* const b343, uint32_t* const b444) { uint16x8_t sum_ma343; uint32x4x2_t sum_b343; - Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444); + Store343_444<offset>(ma3, b3, x, &sum_ma343, 
&sum_b343, ma343, ma444, b343, + b444); } -LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( - const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, - const uint32_t scale, uint16_t* const sum5[5], - uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2], - uint8x8_t* const ma, uint16x8_t* const b) { +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale, + uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5], + uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) { uint16x8_t s5[5]; uint32x4x2_t sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - s5[3] = Sum5Horizontal(s[0]); - s5[4] = Sum5Horizontal(s[1]); + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + s5[3] = Sum5Horizontal(s[0][0]); + s5[4] = Sum5Horizontal(s[1][0]); sq5[3] = Sum5WHorizontal(sq[0]); sq5[4] = Sum5WHorizontal(sq[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = 
vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, + const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; + uint32x4x2_t sq5[5]; + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]); + Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]); + sq5[3] = Sum5WHorizontal(sq[0] + 1); + sq5[4] = Sum5WHorizontal(sq[1] + 1); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + sq5[3] = Sum5WHorizontal(sq[0] + 2); + sq5[4] = Sum5WHorizontal(sq[1] + 2); + vst1q_u16(sum5[3] + x + 
8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s5[3] = s5[4] = Sum5Horizontal(*s); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - 
uint16x8_t* const b) { - uint16x8_t s5[5]; + uint8x16_t s[2], const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2], + uint16x8_t b[2]) { + uint16x8_t s5[2][5]; uint32x4x2_t sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s5[3] = s5[4] = Sum5Horizontal(*s); - sq5[3] = sq5[4] = Sum5WHorizontal(*sq); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - CalculateIntermediate5(s5, sq5, scale, ma, b); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const uint8_t* const src, const uint32_t scale, uint8x16_t* const s, + 
uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2], + uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s3[3]; + uint32x4x2_t sq3[3]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + s3[2] = Sum3Horizontal(*s); + sq3[2] = Sum3WHorizontal(sq); + vst1q_u16(sum3[2], s3[2]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scale, ma, b); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, - uint16_t* const sum3[3], uint32_t* const square_sum3[3], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, - uint16x8_t* const b) { - uint16x8_t s3[3]; + uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2], + uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) { + uint16x8_t s3[4]; uint32x4x2_t sq3[3]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - s3[2] = Sum3Horizontal(*s); - sq3[2] = Sum3WHorizontal(*sq); + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + Sum3Horizontal<8>(s, s3 + 2); + sq3[2] = Sum3WHorizontal(sq); vst1q_u16(sum3[2] + x, s3[2]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); @@ -1062,71 +1333,204 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scale, ma, b); + CalculateIntermediate3<8>(s3, 
sq3, scale, &ma[0], &b[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + sq3[2] = Sum3WHorizontal(sq + 1); + vst1q_u16(sum3[2] + x + 8, s3[3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + s3[1] = vld1q_u16(sum3[0] + x + 8); + s3[2] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const uint8_t* const src0, const uint8_t* const src1, + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) { + uint16x8_t s3[4], s5[5]; + uint32x4x2_t sq3[4], sq5[5]; + s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0])); + sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0])); + sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0])); + sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0])); + SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); + SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); + vst1q_u16(sum3[2], s3[2]); + vst1q_u16(sum3[3], s3[3]); + vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]); + 
vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]); + CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, - const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], - uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0, - uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1, - uint8x8_t* const ma5, uint16x8_t* const b5) { - uint16x8_t s3[4], s5[5]; + const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][4], s5[2][5]; uint32x4x2_t sq3[4], sq5[5]; - s[0].val[1] = vld1_u8(src0 + x + 8); - s[1].val[1] = vld1_u8(src1 + x + 8); - sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); - sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); - SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); - SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], 
&sq5[4]); - vst1q_u16(sum3[2] + x, s3[2]); - vst1q_u16(sum3[3] + x, s3[3]); + s[0][1] = vld1q_u8(src0 + x + 8); + s[1][1] = vld1q_u8(src1 + x + 8); + sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1])); + sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1])); + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x, s3[0][2]); + vst1q_u16(sum3[3] + x, s3[0][3]); vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]); vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]); - vst1q_u16(sum5[3] + x, s5[3]); - vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); - 
CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0); - CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1); - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]); + CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0], + &b3[1][1]); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + + sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1])); + sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1])); + SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x + 8, s3[1][2]); + vst1q_u16(sum3[3] + x + 8, s3[1][3]); + vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]); + vst1q_u16(sum5[3] + x + 8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 8); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, 
scales[1], &ma3[0][1], &b3[0][2]); + CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1], + &b3[1][2]); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const uint8_t* const src, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3, + uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) { + uint16x8_t s3[3], s5[5]; + uint32x4x2_t sq3[3], sq5[5]; + *s = vld1q_u8(src); + sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s)); + sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s)); + SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); + s5[0] = vld1q_u16(sum5[0]); + s5[1] = vld1q_u16(sum5[1]); + s5[2] = vld1q_u16(sum5[2]); + s5[4] = s5[3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + s3[0] = vld1q_u16(sum3[0]); + s3[1] = vld1q_u16(sum3[1]); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3); } LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2], const uint16_t* const sum3[4], const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], - uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3, - uint8x8_t* const ma5, uint16x8_t* 
const b3, uint16x8_t* const b5) { - uint16x8_t s3[3], s5[5]; + uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], + uint16x8_t b3[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][3], s5[2][5]; uint32x4x2_t sq3[3], sq5[5]; - s->val[1] = vld1_u8(src + x + 8); - sq->val[1] = vmull_u8(s->val[1], s->val[1]); - SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); - s5[0] = vld1q_u16(sum5[0] + x); - s5[1] = vld1q_u16(sum5[1] + x); - s5[2] = vld1q_u16(sum5[2] + x); - s5[4] = s5[3]; + s[1] = vld1q_u8(src + x + 8); + sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1])); + SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal(sq, &sq3[2], &sq5[3]); + s5[0][0] = vld1q_u16(sum5[0] + x); + s5[0][1] = vld1q_u16(sum5[1] + x); + s5[0][2] = vld1q_u16(sum5[2] + x); + s5[0][4] = s5[0][3]; sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); @@ -1134,14 +1538,36 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); sq5[4] = sq5[3]; - CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); - s3[0] = vld1q_u16(sum3[0] + x); - s3[1] = vld1q_u16(sum3[1] + x); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + s3[0][0] = vld1q_u16(sum3[0] + x); + s3[0][1] = vld1q_u16(sum3[1] + x); sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); - CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]); + + sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1])); + SumHorizontal(sq + 1, &sq3[2], &sq5[3]); + s5[1][0] = vld1q_u16(sum5[0] + x + 8); + s5[1][1] = vld1q_u16(sum5[1] + x + 8); + s5[1][2] = vld1q_u16(sum5[2] + x + 
8); + s5[1][4] = s5[1][3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); + s3[1][0] = vld1q_u16(sum3[0] + x + 8); + s3[1][1] = vld1q_u16(sum3[1] + x + 8); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]); } inline void BoxSumFilterPreProcess5(const uint8_t* const src0, @@ -1150,33 +1576,39 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); - const uint16x8_t ma = Sum565(mas); - const uint32x4x2_t b = Sum565W(bs); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma565 += 8; - b565 += 8; - x 
+= 8; + uint16x8_t ma[2]; + uint8x16_t masx[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[0] = Sum565<0>(masx); + b[0] = Sum565W(bs); + vst1q_u16(ma565, ma[0]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1185,35 +1617,44 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( const uint8_t* const src, const int width, const uint32_t scale, uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343, uint16_t* ma444, uint32_t* b343, uint32_t* b444) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); + uint8x16_t ma3x[3]; + BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); if (calculate444) { - Store343_444(mas, bs, 0, ma343, ma444, b343, b444); - ma444 += 8; - b444 += 8; + Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8, + b444 + 8); + ma444 += 16; + b444 += 16; } else { - const uint16x8_t ma = Sum343(mas); - const uint32x4x2_t b = Sum343W(bs); - vst1q_u16(ma343, ma); - vst1q_u32(b343 + 
0, b.val[0]); - vst1q_u32(b343 + 4, b.val[1]); + uint16x8_t ma[2]; + uint32x4x2_t b[2]; + ma[0] = Sum343<0>(ma3x); + b[0] = Sum343W(bs); + vst1q_u16(ma343, ma[0]); + vst1q_u32(b343 + 0, b[0].val[0]); + vst1q_u32(b343 + 4, b[0].val[1]); + ma[1] = Sum343<8>(ma3x); + b[1] = Sum343W(bs + 1); + vst1q_u16(ma343 + 8, ma[1]); + vst1q_u32(b343 + 8, b[1].val[0]); + vst1q_u32(b343 + 12, b[1].val[1]); } - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - ma343 += 8; - b343 += 8; - x += 8; + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma343 += 16; + b343 += 16; + x += 16; } while (x < width); } @@ -1221,48 +1662,58 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565, - uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) { - uint8x8x2_t s[2]; - uint8x8x2_t ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565, + uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) { + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, 
square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); - uint16x8_t ma = Sum343(ma3[0]); - uint32x4x2_t b = Sum343W(b3[0]); - vst1q_u16(ma343[0] + x, ma); - vst1q_u32(b343[0] + x, b.val[0]); - vst1q_u32(b343[0] + x + 4, b.val[1]); - Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - ma = Sum565(ma5); - b = Sum565W(b5); - vst1q_u16(ma565, ma); - vst1q_u32(b565 + 0, b.val[0]); - vst1q_u32(b565 + 4, b.val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma565 += 8; - b565 += 8; - x += 8; + uint16x8_t ma[2]; + uint8x16_t ma3x[3], ma5x[3]; + uint32x4x2_t b[2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x); + ma[0] = Sum343<0>(ma3x); + ma[1] = Sum343<8>(ma3x); + b[0] = Sum343W(b3[0] + 0); + b[1] = Sum343W(b3[0] + 1); + vst1q_u16(ma343[0] + x, ma[0]); + vst1q_u16(ma343[0] + x + 8, ma[1]); + vst1q_u32(b343[0] + x, b[0].val[0]); + vst1q_u32(b343[0] + x + 4, b[0].val[1]); + vst1q_u32(b343[0] + x + 8, b[1].val[0]); + vst1q_u32(b343[0] + x + 12, b[1].val[1]); + Prepare3_8<0>(ma3[1], ma3x); + Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); + Prepare3_8<0>(ma5, ma5x); + ma[0] = Sum565<0>(ma5x); + ma[1] = Sum565<8>(ma5x); + b[0] = Sum565W(b5); + b[1] = Sum565W(b5 + 1); + vst1q_u16(ma565, ma[0]); + vst1q_u16(ma565 + 8, ma[1]); + vst1q_u32(b565 + 0, b[0].val[0]); + vst1q_u32(b565 + 4, b[0].val[1]); + vst1q_u32(b565 + 8, b[1].val[0]); + vst1q_u32(b565 + 12, b[1].val[1]); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + ma5[0] = 
ma5[1]; + b5[0] = b5[2]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1310,37 +1761,36 @@ inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s, return CalculateFilteredOutput<5>(s, ma_sum, b_sum); } -inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2], - uint8_t* const dst) { +inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) { const int16x4_t v_lo = vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x4_t v_hi = vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits); const int16x8_t vv = vcombine_s16(v_lo, v_hi); - const int16x8_t s = ZeroExtend(src); - const int16x8_t d = vaddq_s16(s, vv); - vst1_u8(dst, vqmovun_s16(d)); + const int16x8_t d = + vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src)); + return vqmovun_s16(d); } -inline void SelfGuidedDoubleMultiplier(const uint8x8_t src, - const int16x8_t filter[2], const int w0, - const int w2, uint8_t* const dst) { +inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src, + const int16x8_t filter[2], + const int w0, const int w2) { int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0); v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0); v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2); v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } -inline void SelfGuidedSingleMultiplier(const uint8x8_t src, - const int16x8_t filter, const int w0, - uint8_t* const dst) { +inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src, + const int16x8_t filter, + const int w0) { // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) int32x4_t v[2]; v[0] = vmull_n_s16(vget_low_s16(filter), w0); v[1] = vmull_n_s16(vget_high_s16(filter), w0); - SelfGuidedFinal(src, v, dst); + return SelfGuidedFinal(src, v); } LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( @@ -1349,43 +1799,60 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( uint32_t* 
const square_sum5[5], const int width, const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], mas; - uint16x8x2_t sq[2], bs; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + int16x8_t p0, p1; + BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq, + mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); b[1] = Sum565W(bs); vst1q_u16(ma565[1] + x, ma[1]); vst1q_u32(b565[1] + x + 0, b[1].val[0]); vst1q_u32(b565[1] + x + 4, b[1].val[1]); - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); - int16x8_t p0, p1; + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0] = vld1q_u16(ma565[0] + x); b[0].val[0] = vld1q_u32(b565[0] + x + 0); b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p0 = CalculateFilteredOutputPass1(sr0, ma, b); - p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]); - SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x); - SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + p0 = CalculateFilteredOutputPass1(sr00, ma, b); + p1 = 
CalculateFilteredOutput<4>(sr10, ma[1], b[1]); + const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0); + const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565[1] + x + 8, ma[1]); + vst1q_u32(b565[1] + x + 8, b[1].val[0]); + vst1q_u32(b565[1] + x + 12, b[1].val[1]); + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0] = vld1q_u16(ma565[0] + x + 8); + b[0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0].val[1] = vld1q_u32(b565[0] + x + 12); + p0 = CalculateFilteredOutputPass1(sr01, ma, b); + p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]); + const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0); + const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1396,34 +1863,45 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint32_t* const square_sum5[5], uint16_t* ma565, uint32_t* b565, uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq, - &mas.val[0], &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[4]; + BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[2]; + uint8x16_t masx[3]; uint32x4x2_t b[2]; - ma[1] = Sum565(mas); + BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5, + sq + 1, mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = 
Sum565<0>(masx); b[1] = Sum565W(bs); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; ma[0] = vld1q_u16(ma565); b[0].val[0] = vld1q_u32(b565 + 0); b[0].val[1] = vld1q_u32(b565 + 4); - const uint8x8_t sr = vld1_u8(src + x); - const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - ma565 += 8; - b565 += 8; - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + bs[0] = bs[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + 8); + b[0].val[0] = vld1q_u32(b565 + 8); + b[0].val[1] = vld1q_u32(b565 + 12); + const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + ma565 += 16; + b565 += 16; + x += 16; } while (x < width); } @@ -1433,35 +1911,49 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( uint32_t* const square_sum3[3], uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2], uint8_t* const dst) { - uint8x8x2_t s, mas; - uint16x8x2_t sq, bs; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], s.val[0]); - BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], - &bs.val[0]); + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0], + &bs[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; - BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq, - &mas.val[1], &bs.val[1]); uint16x8_t ma[3]; + uint8x16_t ma3x[3]; uint32x4x2_t b[3]; - Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], - b444[1]); - const uint8x8_t sr = 
vld1_u8(src + x); + BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); + Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); ma[0] = vld1q_u16(ma343[0] + x); ma[1] = vld1q_u16(ma444[0] + x); b[0].val[0] = vld1q_u32(b343[0] + x + 0); b[0].val[1] = vld1q_u32(b343[0] + x + 4); b[1].val[0] = vld1q_u32(b444[0] + x + 0); b[1].val[1] = vld1q_u32(b444[0] + x + 4); - const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedSingleMultiplier(sr, p, w0, dst + x); - mas.val[0] = mas.val[1]; - bs.val[0] = bs.val[1]; - x += 8; + const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma343[0] + x + 8); + ma[1] = vld1q_u16(ma444[0] + x + 8); + b[0].val[0] = vld1q_u32(b343[0] + x + 8); + b[0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1].val[1] = vld1q_u32(b444[0] + x + 12); + const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; } while (x < width); } @@ -1474,64 +1966,96 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma343[4], uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s[2], ma3[2], ma5; - uint16x8x2_t sq[2], b3[2], b5; - s[0].val[0] = vld1_u8(src0); - s[1].val[0] = vld1_u8(src1); - sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); - sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); - 
BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], - &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, &ma5[0], &b5[0]); int x = 0; do { - s[0].val[0] = s[0].val[1]; - s[1].val[0] = s[1].val[1]; - sq[0].val[0] = sq[0].val[1]; - sq[1].val[0] = sq[1].val[1]; - BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], - &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); uint16x8_t ma[3][3]; + uint8x16_t ma3x[2][3], ma5x[3]; uint32x4x2_t b[3][3]; - Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], - ma343[2], ma444[1], b343[2], b444[1]); - Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], - b343[3], b444[2]); - ma[0][1] = Sum565(ma5); + int16x8_t p[2][2]; + BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3, + square_sum5, sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + Prepare3_8<0>(ma5, ma5x); + ma[0][1] = Sum565<0>(ma5x); b[0][1] = Sum565W(b5); vst1q_u16(ma565[1] + x, ma[0][1]); vst1q_u32(b565[1] + x, b[0][1].val[0]); vst1q_u32(b565[1] + x + 4, b[0][1].val[1]); - ma3[0].val[0] = ma3[0].val[1]; - ma3[1].val[0] = ma3[1].val[1]; - b3[0].val[0] = b3[0].val[1]; - b3[1].val[0] = b3[1].val[1]; - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - int16x8_t p[2][2]; - const uint8x8_t sr0 = vld1_u8(src + x); - const uint8x8_t sr1 = vld1_u8(src + stride + x); + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t 
sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); ma[0][0] = vld1q_u16(ma565[0] + x); b[0][0].val[0] = vld1q_u32(b565[0] + x); b[0][0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]); - p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]); + p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]); ma[1][0] = vld1q_u16(ma343[0] + x); ma[1][1] = vld1q_u16(ma444[0] + x); b[1][0].val[0] = vld1q_u32(b343[0] + x); b[1][0].val[1] = vld1q_u32(b343[0] + x + 4); b[1][1].val[0] = vld1q_u32(b444[0] + x); b[1][1].val[1] = vld1q_u32(b444[0] + x + 4); - p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]); + p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]); ma[2][0] = vld1q_u16(ma343[1] + x); b[2][0].val[0] = vld1q_u32(b343[1] + x); b[2][0].val[1] = vld1q_u32(b343[1] + x + 4); - p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]); - SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x); - SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x); - x += 8; + p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]); + const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2); + const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2); + + Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2], + &b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565<8>(ma5x); + b[0][1] = Sum565W(b5 + 1); + vst1q_u16(ma565[1] + x + 8, ma[0][1]); + vst1q_u32(b565[1] + x + 8, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 12, b[0][1].val[1]); + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0][0] = vld1q_u16(ma565[0] + x + 
8); + b[0][0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 12); + p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x + 8); + ma[1][1] = vld1q_u16(ma444[0] + x + 8); + b[1][0].val[0] = vld1q_u32(b343[0] + x + 8); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1][1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 12); + p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x + 8); + b[2][0].val[0] = vld1q_u32(b343[1] + x + 8); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 12); + p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]); + const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2); + const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } @@ -1540,58 +2064,79 @@ inline void BoxFilterLastRow( const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { - uint8x8x2_t s, ma3, ma5; - uint16x8x2_t sq, b3, b5; - uint16x8_t ma[3]; + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { + uint8x16_t s[2], ma3[2], ma5[2]; + uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - s.val[0] = vld1_u8(src0); - sq.val[0] = vmull_u8(s.val[0], 
s.val[0]); - BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0], - &b3.val[0], &b5.val[0]); + BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3, + square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0], + &b5[0]); int x = 0; do { - s.val[0] = s.val[1]; - sq.val[0] = sq.val[1]; + uint8x16_t ma3x[3], ma5x[3]; + int16x8_t p[2]; BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3, - square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1], - &b3.val[1], &b5.val[1]); - ma[1] = Sum565(ma5); + square_sum5, s, sq + 1, ma3, ma5, &b3[1], + &b5[1]); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565<0>(ma5x); b[1] = Sum565W(b5); - ma5.val[0] = ma5.val[1]; - b5.val[0] = b5.val[1]; - ma[2] = Sum343(ma3); + Prepare3_8<0>(ma3, ma3x); + ma[2] = Sum343<0>(ma3x); b[2] = Sum343W(b3); - ma3.val[0] = ma3.val[1]; - b3.val[0] = b3.val[1]; - const uint8x8_t sr = vld1_u8(src + x); - int16x8_t p[2]; - ma[0] = vld1q_u16(ma565[0] + x); - b[0].val[0] = vld1q_u32(b565[0] + x + 0); - b[0].val[1] = vld1q_u32(b565[0] + x + 4); - p[0] = CalculateFilteredOutputPass1(sr, ma, b); - ma[0] = vld1q_u16(ma343[0] + x); - ma[1] = vld1q_u16(ma444[0] + x); - b[0].val[0] = vld1q_u32(b343[0] + x + 0); - b[0].val[1] = vld1q_u32(b343[0] + x + 4); - b[1].val[0] = vld1q_u32(b444[0] + x + 0); - b[1].val[1] = vld1q_u32(b444[0] + x + 4); - p[1] = CalculateFilteredOutputPass2(sr, ma, b); - SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x); - x += 8; + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + ma[0] = vld1q_u16(ma565 + x); + b[0].val[0] = vld1q_u32(b565 + x + 0); + b[0].val[1] = vld1q_u32(b565 + x + 4); + p[0] = CalculateFilteredOutputPass1(sr0, ma, b); + ma[0] = vld1q_u16(ma343 + x); + ma[1] = vld1q_u16(ma444 + x); + b[0].val[0] = vld1q_u32(b343 + x + 0); + b[0].val[1] = vld1q_u32(b343 + x + 4); + b[1].val[0] = vld1q_u32(b444 + x + 0); + b[1].val[1] = vld1q_u32(b444 + x + 4); + p[1] = 
CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2); + + ma[1] = Sum565<8>(ma5x); + b[1] = Sum565W(b5 + 1); + b5[0] = b5[2]; + ma[2] = Sum343<8>(ma3x); + b[2] = Sum343W(b3 + 1); + b3[0] = b3[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + x + 8); + b[0].val[0] = vld1q_u32(b565 + x + 8); + b[0].val[1] = vld1q_u32(b565 + x + 12); + p[0] = CalculateFilteredOutputPass1(sr1, ma, b); + ma[0] = vld1q_u16(ma343 + x + 8); + ma[1] = vld1q_u16(ma444 + x + 8); + b[0].val[0] = vld1q_u32(b343 + x + 8); + b[0].val[1] = vld1q_u32(b343 + x + 12); + b[1].val[0] = vld1q_u32(b444 + x + 8); + b[1].val[1] = vld1q_u32(b444 + x + 12); + p[1] = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + x += 16; } while (x < width); } LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
@@ -1628,13 +2173,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0], - square_sum5[1]); + BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1], + square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, ma343, ma444, ma565[0], b343, b444, + square_sum5, ma343, ma444[0], ma565[0], b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -1665,7 +2210,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1689,20 +2234,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2, - sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, - b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, + ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0], + dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const ptrdiff_t bottom_border_stride, + const int width, const int height, + 
SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. @@ -1720,7 +2267,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -1746,7 +2293,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -1763,20 +2310,21 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0, - sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width, + scale, w0, sum5, square_sum5, ma565[0], b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { 
assert(restoration_info.sgr_proj_info.multiplier[0] == 0); - const auto temp_stride = Align<ptrdiff_t>(width, 8); + const auto temp_stride = Align<ptrdiff_t>(width, 16); const ptrdiff_t sum_stride = temp_stride + 8; const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; @@ -1799,7 +2347,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0], nullptr, b343[0], nullptr); Circulate3PointersBy1<uint16_t>(sum3); @@ -1809,7 +2357,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1], ma444[0], b343[1], b444[0]); @@ -1836,7 +2384,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -1849,8 +2397,9 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, // part of the visible frame. 
void SelfGuidedFilter_NEON( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -1864,14 +2413,17 @@ void SelfGuidedFilter_NEON( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -1890,7 +2442,7 @@ void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc index 084f42f..ee50923 100644 --- 
a/src/dsp/arm/mask_blend_neon.cc +++ b/src/dsp/arm/mask_blend_neon.cc @@ -432,7 +432,7 @@ void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc index 8caba7d..3e731b2 100644 --- a/src/dsp/arm/motion_field_projection_neon.cc +++ b/src/dsp/arm/motion_field_projection_neon.cc @@ -382,7 +382,7 @@ void MotionFieldProjectionInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc index 8a403a6..da3ba17 100644 --- a/src/dsp/arm/motion_vector_search_neon.cc +++ b/src/dsp/arm/motion_vector_search_neon.cc @@ -256,7 +256,7 @@ void MotionVectorSearchInit_NEON() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 66ad663..1111a90 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -380,7 +380,7 @@ void ObmcInit_NEON() { Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc index 1680450..91537c4 100644 --- a/src/dsp/arm/super_res_neon.cc +++ b/src/dsp/arm/super_res_neon.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "src/dsp/arm/common_neon.h" #include "src/dsp/super_res.h" #include "src/utils/cpu.h" @@ -20,6 +19,7 @@ #include <arm_neon.h> +#include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" @@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps], } void SuperRes_NEON(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source, int x = RightShiftWithCeiling(upscaled_width, 4); // The below code calculates up to 15 extra upscaled // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // row. kSuperResHorizontalPadding accounts for this. 
do { for (int i = 0; i < 8; ++i, subpixel_x += step) { sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); @@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source, vst1q_u8(dst_ptr, vcombine_u8(d0, d1)); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } @@ -149,12 +149,147 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void SuperResInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void SuperResCoefficients_NEON(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast<uint16_t*>(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + uint16x8_t filter[8]; + for (int i = 0; i < 8; ++i, subpixel_x += step) { + const uint8x8_t filter_8 = + vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >> + kSuperResExtraBits]); + // uint8_t -> uint16_t + filter[i] = vmovl_u8(filter_8); + } + + Transpose8x8(filter); + + vst1q_u16(dst, filter[0]); + dst += 8; + vst1q_u16(dst, filter[1]); + dst += 8; + vst1q_u16(dst, filter[2]); + dst += 8; + vst1q_u16(dst, filter[3]); + dst += 8; + vst1q_u16(dst, filter[4]); + dst += 8; + vst1q_u16(dst, filter[5]); + dst += 8; + vst1q_u16(dst, filter[6]); + dst += 8; + vst1q_u16(dst, filter[7]); + dst += 8; + } while (--x != 0); +} + +// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then +// subtracting all negative with saturation will clip to zero. 
+// 0 1 2 3 4 5 6 7 +// tap sign: - + - + + - + - +inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps], + const uint16_t** coefficients, int bitdepth) { + uint16x8_t f[kSuperResFilterTaps]; + for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) { + f[i] = vld1q_u16(*coefficients); + } + + uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6])); + + uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7])); + + res_lo = vqsubq_u32(res_lo, temp_lo); + + uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6])); + uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7])); + + res_hi = vqsubq_u32(res_hi, temp_hi); + + const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits), + vqrshrn_n_u32(res_hi, kFilterBits)); + + // Clip the result at (1 << bd) - 1. 
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1)); +} + +template <int bitdepth> +void SuperRes_NEON(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast<uint16_t*>(dest); + int y = height; + do { + const auto* filter = static_cast<const uint16_t*>(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalBorder); + int subpixel_x = initial_subpixel_x; + uint16x8_t sr[8]; + int x = RightShiftWithCeiling(upscaled_width, 3); + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalBorder accounts for this. 
+ do { + for (int i = 0; i < 8; ++i, subpixel_x += step) { + sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]); + } + + Transpose8x8(sr); + + const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth); + vst1q_u16(dst_ptr, d0); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->super_res_coefficients = SuperResCoefficients_NEON; + dsp->super_res = SuperRes_NEON<10>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h index f51785d..65e48c5 100644 --- a/src/dsp/arm/super_res_neon.h +++ b/src/dsp/arm/super_res_neon.h @@ -31,7 +31,10 @@ void SuperResInit_NEON(); #if LIBGAV1_ENABLE_NEON #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON -#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON #endif // LIBGAV1_ENABLE_NEON #endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc index 7a41998..c7fb739 100644 --- a/src/dsp/arm/warp_neon.cc +++ b/src/dsp/arm/warp_neon.cc @@ -289,7 +289,7 @@ void Warp_NEON(const void* const source, const ptrdiff_t source_stride, const int16x8_t sum = vld1q_s16(tmp); vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum)); } -#else // !defined(__aarch64__) +#else // !defined(__aarch64__) int16x8_t filter[8]; for (int x = 0; x < 8; ++x) { const int offset = 
@@ -442,7 +442,7 @@ void WarpInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc index 49d3be0..7e5bff0 100644 --- a/src/dsp/arm/weight_mask_neon.cc +++ b/src/dsp/arm/weight_mask_neon.cc @@ -451,7 +451,7 @@ void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc index a59abb0..d3ec21f 100644 --- a/src/dsp/average_blend.cc +++ b/src/dsp/average_blend.cc @@ -76,9 +76,7 @@ void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(10); assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS -#ifndef LIBGAV1_Dsp10bpp_AverageBlend dsp->average_blend = AverageBlend_C<10, uint16_t>; -#endif #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS static_cast<void>(dsp); #ifndef LIBGAV1_Dsp10bpp_AverageBlend diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc new file mode 100644 index 0000000..fe8a9d6 --- /dev/null +++ b/src/dsp/average_blend_test.cc @@ -0,0 +1,322 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/average_blend.h" + +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/distance_weighted_blend.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e8; +constexpr char kAverageBlend[] = "AverageBlend"; +// average_blend is applied to compound prediction values. This implies a range +// far exceeding that of pixel values. +// The ranges include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. 
+constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +struct TestParam { + TestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +std::ostream& operator<<(std::ostream& os, const TestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +template <int bitdepth, typename Pixel> +class AverageBlendTest : public testing::TestWithParam<TestParam>, + public test_utils::MaxAlignedAllocable { + public: + AverageBlendTest() = default; + ~AverageBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + AverageBlendInit_C(); + DistanceWeightedBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_func_ = dsp->average_blend; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + AverageBlendInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + AverageBlendInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->average_blend; + dist_blend_func_ = dsp->distance_weighted_blend; + } + + protected: + void Test(const char* digest, int num_tests, bool debug); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels; + const int width_ = GetParam().width; + const int height_ = GetParam().height; + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + alignas(kMaxAlignment) PredType + 
source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = + {}; + dsp::AverageBlendFunc base_func_; + dsp::AverageBlendFunc func_; + dsp::DistanceWeightedBlendFunc dist_blend_func_; +}; + +template <int bitdepth, typename Pixel> +void AverageBlendTest<bitdepth, Pixel>::Test(const char* digest, int num_tests, + bool debug) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + PredType* src_1 = source1_; + PredType* src_2 = source2_; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + src_1 += width_; + src_2 += width_; + } + absl::Duration elapsed_time; + for (int i = 0; i < num_tests; ++i) { + const absl::Time start = absl::Now(); + func_(source1_, source2_, width_, height_, dest_, + sizeof(dest_[0]) * kDestStride); + elapsed_time += absl::Now() - start; + } + if (debug) { + if (base_func_ != nullptr) { + base_func_(source1_, source2_, width_, height_, reference_, + sizeof(reference_[0]) * kDestStride); + } else { + // Use dist_blend_func_ as the base for C tests. 
+ const int8_t weight = 8; + dist_blend_func_(source1_, source2_, weight, weight, width_, height_, + reference_, sizeof(reference_[0]) * kDestStride); + } + EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_, + kDestStride, kDestStride, false)); + } + + test_utils::CheckMd5Digest( + kAverageBlend, absl::StrFormat("%dx%d", width_, height_).c_str(), digest, + dest_, sizeof(dest_[0]) * kDestStride * height_, elapsed_time); +} + +const TestParam kTestParam[] = { + TestParam(4, 4), TestParam(4, 8), TestParam(8, 8), + TestParam(8, 16), TestParam(16, 8), TestParam(16, 16), + TestParam(16, 32), TestParam(32, 16), TestParam(32, 32), + TestParam(32, 64), TestParam(64, 32), TestParam(64, 64), + TestParam(64, 128), TestParam(128, 64), TestParam(128, 128), +}; + +using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>; + +const char* GetAverageBlendDigest8bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "152bcc35946900b1ed16369b3e7a81b7", + "c23e9b5698f7384eaae30a3908118b77", + }; + static const char* const kDigestsWidth8[] = { + "d90d3abd368e58c513070a88b34649ba", + "77f7d53d0edeffb3537afffd9ff33a4a", + }; + static const char* const kDigestsWidth16[] = { + "a50e268e93b48ae39cc2a47d377410e2", + "65c8502ff6d78065d466f9911ed6bb3e", + "bc2c873b9f5d74b396e1df705e87f699", + }; + static const char* const kDigestsWidth32[] = { + "ca40d46d89773e7f858b15fcecd43cc0", + "bfdc894707323f4dc43d1326309f8368", + "f4733417621719b7feba3166ec0da5b9", + }; + static const char* const kDigestsWidth64[] = { + "db38fe2e082bd4a09acb3bb1d52ee11e", + "3ad44401cc731215c46c9b7d96f7e4ae", + "6c43267be5ed03d204a05fe36090f870", + }; + static const char* const kDigestsWidth128[] = { + "c8cfe46ebf166c1cbf08e8804206aadb", + "b0557b5156d2334c8ce4a7ee12f9d6b4", + }; + // height < width implies 0. + // height == width implies 1. + // height > width implies 2. 
+ const int height_index = block_size.height / block_size.width; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 1]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +TEST_P(AverageBlendTest8bpp, Blending) { + Test(GetAverageBlendDigest8bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest8bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest8bpp(GetParam()), + kNumSpeedTests / (GetParam().height * GetParam().width), false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>; + +const char* GetAverageBlendDigest10bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "98c0671c092b4288adcaaa17362cc4a3", + "7083f3def8bfb63ab3a985ef5616a923", + }; + static const char* const kDigestsWidth8[] = { + "3bee144b9ea6f4288b860c24f88a22f3", + "27113bd17bf95034f100e9046c7b59d2", + }; + static const char* const kDigestsWidth16[] = { + "24c9e079b9a8647a6ee03f5441f2cdd9", + "dd05777751ccdb4356856c90e1176e53", + "27b1d69d035b1525c013b7373cfe3875", + }; + static const char* const kDigestsWidth32[] = { + "efd24dd7b555786bff1a482e51170ea3", + "3b37ddac87de443cd18784f02c2d1dd5", + "80d8070939a743a20689a65bf5dc0a68", + }; + static const char* const kDigestsWidth64[] = { + "af1fe8c52487c9f2951c3ea516828abb", + "ea6f18ff56b053748c18032b7e048e83", + 
"af0cb87fe27d24c2e0afd2c90a8533a6", + }; + static const char* const kDigestsWidth128[] = { + "16a83b19911d6dc7278a694b8baa9901", + "bd22e77ce6fa727267ff63eeb4dcb19c", + }; + // (height < width) -> 0 + // (height == width) -> 1 + // (height > width) -> 2 + const int height_index = block_size.height / block_size.width; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 1]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +TEST_P(AverageBlendTest10bpp, Blending) { + Test(GetAverageBlendDigest10bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest10bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest10bpp(GetParam()), + kNumSpeedTests / (GetParam().height * GetParam().width) / 2, false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h index 2d70d2c..b820b77 100644 --- a/src/dsp/cdef.h +++ b/src/dsp/cdef.h @@ -30,6 +30,7 @@ // The order of includes is important as each tests for a superior version // before setting the base. 
// clang-format off +#include "src/dsp/x86/cdef_avx2.h" #include "src/dsp/x86/cdef_sse4.h" // clang-format on // IWYU pragma: end_exports diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc new file mode 100644 index 0000000..fd64593 --- /dev/null +++ b/src/dsp/cdef_test.cc @@ -0,0 +1,409 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" + +#include <cstdint> +#include <cstring> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr char kCdef[] = "Cdef"; +constexpr char kCdefDirectionName[] = "Cdef Direction"; +constexpr char kCdefFilterName[] = "Cdef Filtering"; +constexpr int kTestBufferStride = 8; +constexpr int kTestBufferSize = 64; +constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8; +constexpr int kSourceBufferSize = + (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride; +constexpr int kNumSpeedTests = 5000; + +const char* GetDirectionDigest(const int bitdepth, const int num_runs) { + static const char* const kDigest[2][2] = { + 
{"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"}, + {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}}; + const int bitdepth_index = (bitdepth == 8) ? 0 : 1; + const int run_index = (num_runs == 1) ? 0 : 1; + return kDigest[bitdepth_index][run_index]; +} + +template <int bitdepth, typename Pixel> +class CdefDirectionTest : public testing::TestWithParam<int> { + public: + CdefDirectionTest() = default; + CdefDirectionTest(const CdefDirectionTest&) = delete; + CdefDirectionTest& operator=(const CdefDirectionTest&) = delete; + ~CdefDirectionTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + CdefInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cdef_direction_ = nullptr; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "SSE41/")) { + CdefInit_SSE4_1(); + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + CdefInit_AVX2(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + CdefInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_cdef_direction_ = dsp->cdef_direction; + } + + void TestRandomValues(int num_runs); + + Pixel buffer_[kTestBufferSize]; + int strength_; + int size_; + + CdefDirectionFunc base_cdef_direction_; + CdefDirectionFunc cur_cdef_direction_; +}; + +template <int bitdepth, typename Pixel> +void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (cur_cdef_direction_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration elapsed_time; + libvpx_test::MD5 actual_digest; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) 
{ + for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) { + for (int bits = 0; bits <= bitdepth; ++bits) { + for (auto& pixel : buffer_) { + pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << bitdepth) - 1); + } + int output[2] = {}; + const absl::Time start = absl::Now(); + cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel), + reinterpret_cast<uint8_t*>(&output[0]), &output[1]); + elapsed_time += absl::Now() - start; + actual_digest.Add(reinterpret_cast<const uint8_t*>(output), + sizeof(output)); + } + } + } + test_utils::CheckMd5Digest(kCdef, kCdefDirectionName, + GetDirectionDigest(bitdepth, num_runs), + actual_digest.Get(), elapsed_time); +} + +using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>; + +TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests / 100); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0)); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>; + +TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests / 100); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33", + "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54", + 
"1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4", + "9deaaf07db25ca1d96ea8762925372d3", "7edadd9ad058be518430e64f78fe34a2", + "862362a654edb2562609895395eb69cd", "3b4dae4d353b75f652ce67f96b2fd718", + "65c51f49e4fd848d9fef23a346702b17", "f93b3fa86764e53e4c206ef01d5ee9db", + "202e36551bc147c30b76ae359d5f7646", "3de677a2b6fe4aa6fc29a5e5f2d63063", + "ab860362809e878f7b47dacc6087bce3", "c0d991affc8aeb45d91ae36e7b3d77d8", + "27f19fffabfb79104b4be3c272723f62", "a54b981f562e2cf10a4fb037d0181e2d", + "9a65933d02867a1e8fc1f29097d4d0db", "c068b21d232145c61db8ef9298447bfa", + "8db1948c23648372509e4f3577e8eaa0", "c08a3b192ab0a47abe22f7f0ae78a5d7", + "4ff9bd4ae06f2cc2d2660df41cf1baca", "a0a634e48c55a2ca340cf5cac7f74cb6", + "f9f631985b42214f8b059c8f119d4401", "5fb136073300a45d74145649473970da", + "33624aab8ba0264657fa9304dbdcf72c", "e6a15775d451a3c4803a7c0604deb0ea", + "4c28b63022cdc5ea0e49b492c187d53d", "c5fa9792ee292d29c5a864e376ddacc0", + "fcdf7319978b64f03ca3b9d4d83a0c2a", "394931c89bd5065308b0633d12370b19", + "9e702d68000c1b02759001e9a8876df2", "c844919f0114e83960dd329b1aa7146f", + "499248c675884db3ef57018d0a0868b5", "4a9041ed183f9add717e5ddcdb280799", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70", + "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8", + "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31", + "18910f422e386c71ffde8680176d61c0", "3255afe8b3db5be4c17299420ae9b4b3", + "ccac34de92891d4ef25820737e7a4f06", "5c2109c4142867c15bc6bb81e19b8058", + "86e8300e2ad292bfce95185530ef06c8", "21c06ed6d62b8fbef1363cd177386cd0", + "fd6687987dbff6f15210c2cc61570daa", "7cb246cb65a9cf9b2f829ab086f7c45a", + "3a38dc3c89f7e400383b1b7ce3e73008", "7b23b520e41ad510b9608b47f9c5f87e", + "f9ca24b57fc06d7b8dc4151bbc4d2840", "070ef8fa64dcdc45701428ee6ef0ca79", + 
"0e7e3ca3cf8546972d01fc262b2b9cfb", "9ac81b7cf93173f33d195927b0a3685a", + "1f964b6959774651a79d961e5a2a6a56", "64d5f88995a918a85df317d4240f0862", + "55c94ec09facda30fac677d205beb708", "2c010b256f4dabf42ef78bf5a3851b2c", + "c7d18d0e287fa8658b94131603e378db", "4f7696fe2c8dbedd0c8e8a53b9dec0fc", + "b3483dc32665a4bb0606d78dfb3d285c", "0bcb4acd4090f5798c2d260df73b2c46", + "4f574c782f3b28fb9c85cdb70dfcb46a", "14bd700a88be0107e9ef2fe54f75cee6", + "5d3b2698c9ffa4a6aed45a9adbddb8bf", "eff870414f80897cf8958ebeea84f0a6", + "e042843275f82271a9f540bc3e4ef35c", "26e3ff3d661dac25861a0f5bab522340", + "239844e66b07796003f9315166b9e29e", "44b8e6884215a1793cc7f8f7ce40bcee", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct CdefTestParam { + CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4, + int columns4x4) + : subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + rows4x4(rows4x4), + columns4x4(columns4x4) {} + int subsampling_x; + int subsampling_y; + int rows4x4; + int columns4x4; +}; + +std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) { + return os << "subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4 + << ", " << param.columns4x4; +} + +// TODO(b/154245961): rework the parameters for this test to match +// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and +// primary/secondary strength combinations for both Y and UV. 
+template <int bitdepth, typename Pixel> +class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> { + public: + CdefFilteringTest() = default; + CdefFilteringTest(const CdefFilteringTest&) = delete; + CdefFilteringTest& operator=(const CdefFilteringTest&) = delete; + ~CdefFilteringTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + CdefInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + CdefInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + CdefInit_SSE4_1(); + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + CdefInit_AVX2(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_)); + } + + void TestRandomValues(int num_runs); + + uint16_t source_[kSourceBufferSize]; + Pixel dest_[kMaxPlanes][kTestBufferSize]; + int primary_strength_; + int secondary_strength_; + int damping_; + int direction_; + CdefTestParam param_ = GetParam(); + + CdefFilteringFuncs cur_cdef_filter_; +}; + +template <int bitdepth, typename Pixel> +void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + const int id = ((param_.rows4x4 < 4) + (param_.rows4x4 < 2)) * 3 + + param_.subsampling_x * 9 + param_.subsampling_y * 18; + absl::Duration elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { + const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x; + const int subsampling_y = (plane == kPlaneY) ? 
0 : param_.subsampling_y; + const int block_width = 8 >> subsampling_x; + const int block_height = 8 >> subsampling_y; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + id + plane); + const int offset = 2 * kSourceStride + 2; + // Fill boundaries with a large value such that cdef does not take them + // into calculation. + const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x; + const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y; + for (int y = 0; y < plane_height; ++y) { + for (int x = 0; x < plane_width; ++x) { + source_[y * kSourceStride + x + offset] = + rnd.Rand16() & ((1 << bitdepth) - 1); + } + } + for (int y = 0; y < 2; ++y) { + Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride); + Memset(&source_[(y + plane_height + 2) * kSourceStride], + kCdefLargeValue, kSourceStride); + } + for (int y = 0; y < plane_height; ++y) { + Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2); + Memset(&source_[y * kSourceStride + offset + plane_width], + kCdefLargeValue, 2); + } + do { + int strength = rnd.Rand16() & 15; + if (strength == 3) ++strength; + primary_strength_ = strength << (bitdepth - 8); + } while (primary_strength_ == 0); + do { + int strength = rnd.Rand16() & 3; + if (strength == 3) ++strength; + secondary_strength_ = strength << (bitdepth - 8); + } while (secondary_strength_ == 0); + damping_ = (rnd.Rand16() & 3) + 3; + direction_ = (rnd.Rand16() & 7); + + memset(dest_[plane], 0, sizeof(dest_[plane])); + const absl::Time start = absl::Now(); + const int width_index = block_width >> 3; + if (cur_cdef_filter_[width_index][0] == nullptr) return; + cur_cdef_filter_[width_index][0]( + source_ + offset, kSourceStride, block_height, primary_strength_, + secondary_strength_, damping_, direction_, dest_[plane], + kTestBufferStride * sizeof(dest_[0][0])); + elapsed_time += absl::Now() - start; + } + } + + for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { + if 
(bitdepth == 8) { + test_utils::CheckMd5Digest(kCdef, kCdefFilterName, + GetDigest8bpp(id + plane), + reinterpret_cast<uint8_t*>(dest_[plane]), + sizeof(dest_[plane]), elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest(kCdef, kCdefFilterName, + GetDigest10bpp(id + plane), + reinterpret_cast<uint8_t*>(dest_[plane]), + sizeof(dest_[plane]), elapsed_time); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } + } +} + +// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not +// supported. +const CdefTestParam cdef_test_param[] = { + CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2), + CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2), + CdefTestParam(0, 1, 4, 4), CdefTestParam(0, 1, 2, 2), + CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2), +}; + +using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>; + +TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp, + testing::ValuesIn(cdef_test_param)); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>; + +TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp, + testing::ValuesIn(cdef_test_param)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff 
--git a/src/dsp/constants.cc b/src/dsp/constants.cc index 0099ca3..1b85795 100644 --- a/src/dsp/constants.cc +++ b/src/dsp/constants.cc @@ -20,7 +20,7 @@ namespace libgav1 { // Each set of 7 taps is padded with a 0 to easily align and pack into the high // and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo. -const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = { +alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = { {{-6, 10, 0, 0, 0, 12, 0, 0}, {-5, 2, 10, 0, 0, 9, 0, 0}, {-3, 1, 1, 10, 0, 7, 0, 0}, diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc index 8c6f68f..727b4af 100644 --- a/src/dsp/convolve.cc +++ b/src/dsp/convolve.cc @@ -623,6 +623,8 @@ void ConvolveIntraBlockCopy2D_C(const void* const reference, const int /*vertical_filter_id*/, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const Pixel*>(reference); const ptrdiff_t src_stride = reference_stride / sizeof(Pixel); auto* dest = static_cast<Pixel*>(prediction); @@ -676,6 +678,8 @@ void ConvolveIntraBlockCopy1D_C(const void* const reference, const int /*vertical_filter_id*/, const int width, const int height, void* prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); const auto* src = static_cast<const Pixel*>(reference); const ptrdiff_t src_stride = reference_stride / sizeof(Pixel); auto* dest = static_cast<Pixel*>(prediction); diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc new file mode 100644 index 0000000..4a2a9f1 --- /dev/null +++ b/src/dsp/convolve_test.cc @@ -0,0 +1,1373 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/convolve.h" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> +#include <string> +#include <tuple> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// The convolve function will access at most (block_height + 7) rows/columns +// from the beginning. +constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps; +constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps; + +// Test all the filters in |kSubPixelFilters|. There are 6 different filters but +// filters [4] and [5] are only reached through GetFilterIndex(). +constexpr int kMinimumViableRuns = 4 * 16; + +// When is_scaled_convolve_ is true, we don't test every combination of +// type_param_, so some digests in ths array are redudant, marked as +// "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa". 
+// We keep it so that the logic of calculation id in GetDigestId() is clearer. +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "ae5977a4ceffbac0cde72a04a43a9d57", "fab093b917d36f6b69fb4f50a6b5c822", + "1168251e6261e2ff1fa69a93226dbd76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d2f5ca2b7958c332a3fb771f66da01f0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6bbcc075f8b768a02cdc9149f150326d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c4e90cd202f9867517433b550afdc644", "43d6df191744f6c5d489c0673714a714", + "bfe8197057b0f3f096344251047f481f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1681719b0f8905d99382f4132fe1472a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8d24b59c0f3942079ba4945ed6686269", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ae5977a4ceffbac0cde72a04a43a9d57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "995318eff1fe62822366490192ad8b5e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0ef1c5beb3228c6d9ecf3ced584c4aa8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fc02228efb85c665bd27a3dab72a9037", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6cf5f791fe0d8dcd3526be3c6b814035", "eaa0942097fd2b2dd621b77e0a659896", + "4821befdf63f8c6da6440afeb57f320f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7aec92c3b65e456b64ae285c12b03b0d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ae70d9db2ec36885394db7d59bdd4f7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "911212ae2492690de06d12bfaf71c7d4", "cb284b0ae039582039563638f682db26", + "6b4393b2d7387dd291d3a7bd3aabcae4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0804d93136549388b6cd7fdcd187a578", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b25f037602efdb4eaacb3ade1dc5c28f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6cf5f791fe0d8dcd3526be3c6b814035", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "704b0bb4128aa163ef5899e6d8ad9664", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "abf3f31ec4daff000e80f7ab9628688b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "09e12a389cd454e10f750062102ea1b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d905dfcad930aded7718587c05b48aaf", "fe85aaee8007d2130d56919242e01163", + "c30fc44d83821141e84cc4793e127301", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f72a99ad63f6a88c23724e898b705d21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5fee162fe52c11c823db4d5ede370654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a9210113ff6873e5b50d5d3ad67e440f", "b7633a78f959b20ca27ffb700b44b45c", + "6d1c5145be9fd636ababd64c64d23a10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d55d8012ddddb55e6c3e51dafab92980", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1948cb353fa308f0d5592b0ad338997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d905dfcad930aded7718587c05b48aaf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "04e3b7f46e748431c76cf6125057601c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "71362b65cffd008d1ca4a20adc8cc15f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "987f7a6a8bef47acbd1e49bb39f51ac4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6baf153feff04cc5b7e87c0bb60a905d", "fa1ad095bf696745599079fb73975b75", + "a8293b933d9f2e5d7f922ea40111d643", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "07a1f07f114c4a38ba08d2f44e1e1132", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9365186c59ef66d9def40f437022ad93", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7305087fae23de53d21a6909009ff69", "bd44440b5757b74bcc3e2f7f32ef42af", + "a5a1ac658d7ce4a846a32b9fcfaa3475", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3b1ceebf0579fcbbfd6136938c595b91", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3bfad931bce82335219e0e29c15f2b21", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6baf153feff04cc5b7e87c0bb60a905d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4cfad2c437084a93ea76913e21c2dd89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1a0bdfc96a3b9fd904e658f238ab1076", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8a710baa6a9fc784909671d450ecd99", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "871ed5a69ca31e6444faa720895949bf", "e55d0c54fd28355d32e29d411488b571", + "354a54861a94e8b027afd9931e61f997", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "26b9de95edb45b31ac5aa19825831c7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0f95fb0276c9c7910937fbdf75f2811d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8dcce009395264379c1a51239f4bb22c", "06925f05ea49811e3efc2a44b111b32b", + "2370f4e4a83edf91b7f504bbe4b00e90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ecafabcad1045f15d31ce2f3b13132f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68a701313d2247d2b32636ebc1f2a008", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "871ed5a69ca31e6444faa720895949bf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d372f0c17bce98855d6d59fbee814c3d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "56d16e54afe205e97527902770e71c71", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f9e6a56382d8d12da676d6631bb6ef75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68e2f90eaa0ab5da7e6f5776993f7eea", "8718965c4831a363a321a25f4aada7ba", + "eeeb8589c1b31cbb565154736ca939ec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c1b836a6ce023663b90db0e320389414", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b355dab2dbb6f5869018563eece22862", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8dcce009395264379c1a51239f4bb22c", "e7c2bfd356c860c36053dea19f8d0705", + "ae5464066a049622a7a264cdf9394b55", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5f211eba020e256a5781b203c5aa1d2e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "05afe1f40d37a45a97a5e0aadd5066fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68e2f90eaa0ab5da7e6f5776993f7eea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d99ffd2579eb781c30bc0df7b76ad61e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3bf8e11e18527b16f0d7c0361d74a52d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1f8282fb33c30eb68c0c315b7a4bc01", "4c718ddbe8b5aa7118c8bc1c2f5ea158", + "f49dab626ddd977ed171f79295c24935", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5befcf222152ebc8d779fcc10b95320a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cf6ff8c43d8059cea6090a23ab66a0ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d90a69e7bae8aa46ed0e1e5f911d7a07", "1d7113d705fa0edeef49e5c50a91151d", + "45368b6db3d1fee739a64b0bc823ea9c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3b04497634364dd2cd3f2482b5d4b32f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e1f0e0bddb58d15d0925eeaede9b84c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1f8282fb33c30eb68c0c315b7a4bc01", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4e139e57cbb049a0f4ef816adc48d026", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "79e9e260a2028c5fe320005c272064b9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9412064b0eebf8123f23d74147d04dff", "0dee657827cd48c4ce4a7657f6f92233", + "78d2f27e0d4708cb16856d7d40dc16fb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "62adf407fc27d8682ced4dd7b55af14e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a336f8b7bcf188840ca65c0d0e66518a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ab4dc87be03be1dcc5d956ca819d938", "78cef82670ff99b1e4a279de3538c233", + "8dff0f28192d9f8c0bf7fb5405719dd8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a8ac7b5dc65ffb758b0643508a0e744e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "03313cdaa593a1a7b4869010dcc7b241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9412064b0eebf8123f23d74147d04dff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "be53b2507048e7ff50226d15c0b28865", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2418ebcdf85551b9ae6e3725f04aae6d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "06ef1504f31af5f173d3317866ca57cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cc08936effe309ab9a4fa1bf7e28e24e", "a81bcdeb021d3a23477c40c47548df52", + "9d2393ea156a1c2083f5b4207793064b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "35be0786a072bf2f1286989261bf6580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "de953f03895923359c6a719e6a537b89", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ab4dc87be03be1dcc5d956ca819d938", 
"e053321d7c75951d5ff3dce85762acd3", + "632738ef3ff3021cff45045c41978849", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "561ed8be43c221a561f8885a0d74c7ef", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "88a50d2b4107ee5b5074b2520183f8ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cc08936effe309ab9a4fa1bf7e28e24e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b73f3c1a10405de89d1f9e812ff73b5a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98bdf907ebacacb734c9eef1ee727c6e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "635e8ee11cf04d73598549234ad732a0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "36cbef36fa21b98df03536c918bf752a", "b7a4d080e2f24040eebb785f437de66a", + "a9c62745b95c66fa497a524886af57e2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "90562fc42dc5d879ae74c4909c1dec30", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8463ade9347ed602663e2cec5c4c3fe6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8f2afdb2f03cd04ffacd421b958caaa0", "2e15a26905467e5ad9f8da04b94e60b6", + "f7ec43384037e8d6c618e0df826ec029", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8159619fc234598c8c75154d80021fd4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac50ea9f7306da95a5092709442989cf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "36cbef36fa21b98df03536c918bf752a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c7d51b1f2df49ab83962257e8a5934e5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4dd5672d53c8f359e8f80badaa843dfc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fab693410d59ee88aa2895527efc31ac", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d0da6321cf5311ea0bdd41271763030", "22ff7819c55ce6b2e0ce5431eb8c309c", + "2c614ec4463386ec075a0f1dbb587933", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a1427352f9e413975a0949e2b300c657", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "392de11ffcd5c2ecf3db3480ee135340", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "710ccecc103033088d898a2b924551fb", "160c29a91e372d66b12e171e4d81bc18", + "a6bc648197781a2dc99c487e66464320", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8f43645dce92cf7594aa4822aa53b17d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "739b17591437edffd36799237b962658", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d0da6321cf5311ea0bdd41271763030", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "159e443d79cc59b11ca4a80aa7aa09be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a1bef519bbf07138e2eec5a91694de46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3041eb26c23a63a587fbec623919e2d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "55a10165ee8a660d7dddacf7de558cdd", "355b691a656e6a287e4504ef2fbb8034", + "7a8856480d752153370240b066b90f6a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bcbc418bc2beb243e463851cd95335a9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bddd31e3e852712e6244b616622af83d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "710ccecc103033088d898a2b924551fb", "f6cb80c4d5683553929b1e41f99e487e", + "1112ebd509007154c72c5a485b220b62", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b6ccddb7dfa4eddc87b4eff08b5a3195", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8a7eb7dd9c216e240517edfc6489397", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "55a10165ee8a660d7dddacf7de558cdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ef14b14882e1465b0482b0e0b16d8ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "df1cb51fe1a937cd7834e973dc5cb814", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c61d99d5daf575664fb7ad64976f4b03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac7fc9f9ea7213743fae5a023faaaf08", "a6307a981600c3fb5b9d3e89ddf55069", + "beaef1dbffadc701fccb7c18a03e3a41", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cb8fedcbecee3947358dc61f95e56530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "30a36245c40d978fc8976b442a8600c3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4093e3e5902dd659407ce6471635a4e", "658f0f51eb2f965f7490053852653fc0", + "9714c4ce636b6fb0ad05cba246d48c76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b4e605327b28db573d88844a1a09db8d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "75b755f199dbf4a0e5ebbb86c2bd871d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac7fc9f9ea7213743fae5a023faaaf08", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "22a8d287b425c870f40c64a50f91ce54", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "317fe65abf81ef3ea07976ef8667baeb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "822f6c4eb5db760468d822b21f48d94d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "077e1b7b355c7ab3ca40230ee8efd8ea", "628229ce2484d67e72c51b2f4ad124a6", + "72b1e700c949d06eaf62d664dafdb5b6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0d0154a7d573685285a83a4cf201ac57", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "93aa662b988b8502e5ea95659eafde59", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "375d7f5358d7a088a498b8b3aaecc0d5", "b726ef75b641c21519ecc2f802bbaf39", + "2c93dde8884f09fb5bb5ad6d95cde86d", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "15b00a15d1cc6cc96ca85d00b167e4dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "31b0017ba1110e3d70b020901bc15564", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "077e1b7b355c7ab3ca40230ee8efd8ea", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f1d96db5a2e0a2160df38bd96d28d19b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2da29da97806ae0ee300c5e69c35a4aa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3f6fcb9fae3666e085b9e29002a802fc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7a3e8de2a1caae206cf3e51a86dfd15a", "c266a1b65599686c771fad8a822e7a49", + "684f5c3a25a080edaf79add6e9137a8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b14bd8068f108905682b83cc15778065", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "70440ba9ee7f9d16d297dbb49e54a56e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "375d7f5358d7a088a498b8b3aaecc0d5", "4dca696cc0552c1d684c4fc963adc336", + "a49e6160b5d1b56bc2046963101cd606", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7bf911888c11a9fefd604b8b9c82e9a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0a1aa8f5ecfd11ddba080af0051c576a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7a3e8de2a1caae206cf3e51a86dfd15a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "637d1e5221422dfe9a6dbcfd7f62ebdd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "555475f5d1685638169ab904447e4f13", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d9b9fecd195736a6049c528d4cb886b5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1ddf9020f18fa7883355cf8c0881186a", 
"e681b35b1fe02e2a6698525040015cd0", + "3be970f49e4288988818b087201d54da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c96c867d998473197dde9b587be14e3a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1eb2be4c05b50e427e29c72fa566bff5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "08867ea5cc38c705ec52af821bc4736a", "c51c8bb294f4fa20bdab355ad1e7df37", + "7f084953976111e9f65b57876e7552b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bfb69b4d7d4aed73cfa75a0f55b66440", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "536181ee90de883cc383787aec089221", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1ddf9020f18fa7883355cf8c0881186a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f275af4f1f350ffaaf650310cb5dddec", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b3e3a6234e8045e6182cf90a09f767b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fed17fc391e6c3db4aa14ea1d6596c87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2377dd167ef2707978bed6f10ffd4e76", "b1f6c0cd490b584b1883222a4c281e0f", + "d2b9dba2968894a414756bb510ac389a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f596c63c7b14cada0174e17124c83942", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "52c0980bae63e8459e82eee7d8af2334", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2afb540e8063f58d1b03896486c5e89b", "b929f7956cf35dd6225ca6cf45eacb23", + "0846ec82555b66197c5c45b08240fbcc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "034d1d62581bd0d840c4cf1e28227931", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "29f82b0f3e4113944bd28aacd9b8489a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2377dd167ef2707978bed6f10ffd4e76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f81c4d6b001a14584528880fa6988a87", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "849dfeca59074525dea59681a7f88ab4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d0d3482d981989e117cbb32fc4550267", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f918e0e4422967c6a7e47298135c7ae9", "fc8718e6f9e6663c2b6bf9710f835bfc", + "9a3215eb97aedbbddd76c7440837d040", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "eb2822ad8204ed4ecbf0f30fcb210498", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "75e57104d6058cd2bce1d3d8142d273d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2afb540e8063f58d1b03896486c5e89b", "d9d9f3c699cd03ab9d698e6b235ddcc6", + "ca7471c126ccd22189e874f0a6e41960", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8cba849640e9e2859d509bc81ca94acd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ee3e76371240d1f1ff811cea6a7d4f63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f918e0e4422967c6a7e47298135c7ae9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a5a2f9c2e7759d8a3dec1bc4b56be587", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "39a68af80be11e1682b6f3c4ede33530", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "39561688bf6680054edbfae6035316ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b2264e129636368b5496760b39e64b7a", "4dbb4ce94d4948c990a51b15959d2fa6", + "4e317feac6da46addf0e8b9d8d54304b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "538ce869ffd23b6963e61badfab7712b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b4c735269ade44419169adbd852d5ddc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ce47b11d2e60c5d183c84ce9f2e46cc", "3ac8d5b68ebb29fd1a41c5fa9d5f4382", + "0802b6318fbd0969a33de8fdfcd07f10", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bc79acf2a0fe419194cdb4529bc7dcc8", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "17a20dbbf09feae557d40aa5818fbe76", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b2264e129636368b5496760b39e64b7a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2317c57ab69a36eb3bf278cf8a8795a3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b22d765af176d87e7d3048b4b89b86ad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "087c5992ca6f829e1ba4ba5332d67947", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "84777bdeb84e2530a1c8c1ee432ec934", + "b384e9e3d81f9f4f9024028fbe451d8b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4e4677a0623d44237eb8d6a622cdc526", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "356d4003477283e157c8d2b5a79d913c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "1e58b76ca365b0bd4fd3c4519ec4a500", + "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9cf1deba08dac5972b3b0a43eff8f98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1e58b76ca365b0bd4fd3c4519ec4a500", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "24accebe2e795b13fcb56dd3abacf53f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "98f584ceaf2d65af997f85d71ceeda1b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static 
const char* const kDigest[] = { + "b1b6903d60501c7bc11e5285beb26a52", "3fa4ebd556ea33cfa7f0129ddfda0c5b", + "a693b4bd0334a3b98d45e67d3985bb63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3e787534dff83c22b3033750e448865a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "fd1da8d197cb385f7917cd296d67afb9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d9941769b66d012c68f70accc1a3b664", "98728677401560d7c29ba8bec59c6a00", + "2924788891caa175bb0725b57de6cbd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "915a60e7bb2c38ad5a556098230d6092", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a25de86fd8d389c1c75405aac8049b58", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1b6903d60501c7bc11e5285beb26a52", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cf792b94b1f3f321fa0c1d6362d89c90", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5f1622fde194bd04560b04f13dc47a7c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d935e0ec1d933d0c48fa529be4f998eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7855ed75772d7fa815978a202bbcd9f", "cd3e8b96ff6796650e138f5d106d70d4", + "156de3172d9acf3c7f251cd7a18ad461", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4c91f676a054d582bcae1ca9adb87a31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a984202c527b757337c605443f376915", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "20a390cc7e06a265ecc1e118f776c25a", "ab0da36b88021ed0efd806a1a4cd4fa0", + "fc57a318fbf0c0f29c24edbc84e35ec6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "568055866caf274d67e984307cda2742", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3ff2b19730d6bb8b97f4d72085d2d5b8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a7855ed75772d7fa815978a202bbcd9f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "acc8588292b326f15076dd3a3d260072", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f990a13f7a062665d7f18a40bd5da2ae", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "931df73c3d50c4b2e4ec3502bc8774de", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bde291a4e8087c085fe8b3632f4d7351", "555eead3b67766f56b0e3714d431506f", + "e545b8a3ff958f8363c7968cbae96732", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "eab5894046a99ad0a1a12c91b0f37bd7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c347f4a58fd784c5e88c1a23e4ff15d2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9272ee0820b09bfdc252a97b2e103862", "be8dd418158226a00d5e01ccc3e4f66b", + "34b37b59ee49108276be28a2e4585c2d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f4deb462014249d4ab02db7f7f62308e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6ae557169928f3be15c7aad8d67205b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bde291a4e8087c085fe8b3632f4d7351", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "14be0f12550c814f75655b4e1e22ddde", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "af4cadb78ee54aacebac76c8ad275375", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c0c4ebfd6dbbddd88114c36e8c9085da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "238980eebc9e63ae3eea2771c7a70f12", "661c69a7b49984fa1e92cf8485ab28b6", + "7842b2047356c1417d9d88219707f1a1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "765b4cfbfc1a4988878c412d53bcb597", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "29cbaadbff9adf4a3d49bd9900a9dd0b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7e3fa9c03bc3dfbdeb67f24c5d9a49cd", "a65e13b534b32fdff3f48d09389daaf1", + "da1a6ff2be03ec8acde4cb1cd519a6f0", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d54206c34785cc3d8a06c2ceac46378c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b1f26ee13df2e14a757416ba8a682278", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "238980eebc9e63ae3eea2771c7a70f12", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e552466a4e7ff187251b8914b084d404", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aba5d5ef5e96fe418e65d20e506ea834", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "972aeba65e8a6d20dd0f95279be2aa75", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0eac13431bd7d8a573318408a72246d5", "71c57b774e4c3d9b965b060e2a895448", + "1a487c658d684314d91bb6d961a94672", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bc63b29ec78c1efec5543885a45bb822", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c5997b802a6ba1cf5ba1057ddc5baa7e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f3454ca93cbb0c8c09b0695d90a0df3d", "d259b9c0d0e3322114b2bcce04ae35dd", + "a4ca37cb869a0dbd1c4a2dcc449a8f31", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "85a11892ed884e3e74968435f6b16e64", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "996b6c166f9ed25bd07ea6acdf7597ff", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0eac13431bd7d8a573318408a72246d5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "981b7c44b6f7b7ac2acf0cc4096e6bf4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d70bf16e2a31e90b7b3cdeaef1494cf9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34165457282e2af2e9b3f5840e4dec5d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "73438155feb62595e3e406921102d748", 
"86d00d2e3dd4a198343f37e3dc4461c9", + "0635a296be01b7e641de98ee27c33cd2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cecd57396a0033456408f3f3554c6912", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "59f33727e5beeb783a057770bec7b4cd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f3454ca93cbb0c8c09b0695d90a0df3d", "b11f98b5bb864413952d47a67b4add79", + "1b5d1d4c7be8d5ec00a42a49eecf918f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "16434230d24b9522ae2680e8c37e1b95", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34895d4c69a6c3303693e6f431bcd5d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "73438155feb62595e3e406921102d748", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4c75372af36162831cb872e24e1088c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6df80bb7f264f4f285d09a4d61533fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b8c5582b9bbb789c45471f93be83b41f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5871e0e88a776840d619670fbf107858", "57dd2cde826c50e0b0ec504396cb3ceb", + "82dc120bf8c2043bc5eee81007309ebf", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5b37f94ef136c1eb9a6181c19491459c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0654d72f22306b28d9ae42515845240c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1a77d2af4d2b6cf8737cfbcacacdc4e4", "7123d4aa8083da90ec6986dda0e126ce", + "98b77e88b0784baaea64c98c8707fe46", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "963dea92f3efbb99137d1de9c56728d3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9497b00cb1bc3363dd126ffdddadc8e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5871e0e88a776840d619670fbf107858", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "497271227a70a72f9ad25b415d41563f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c8831118d1004a7cca015a4fca140018", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "257bf5467db570974d7cf2356bacf116", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "6fff9189c1d11f183f7c42d4ce5febdb", + "58c826cad3c14cdf26a649265758c58b", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "716ba3a25b454e44b46caa42622c128c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6c9d7d9e6ef81d76e775a85c53abe209", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "89bec831efea2f88129dedcad06bb3fa", "e1ef4ae726d864b36a9b64b1e43ede7e", + "8148788044522edc3c497e1017efe2ce", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b72fb6a9a073c2fe65013af1842dc9b0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1e461869bb2ee9b6069c5e52cf817291", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c48bd7e11ec44ba7b2bc8b6a04592439", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b7f82c140369067c105c7967c75b6f9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5255dded79f56b0078543b5a1814a668", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d675e0195c9feca956e637f3f1959f40", "670fa8c31c82fced9a810b64c03e87ee", + "f166254037c0dfb140f54cd7b08bddfe", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9076f58c4ab20f2f06d701a6b53b1c4f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a35f435ccc67717a49251a07e62ae204", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "89bec831efea2f88129dedcad06bb3fa", "7c3a79a90f3f4b460540e796f3197ef1", + "acf60abeda98bbea161139b915317423", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "86fa0c299737eb499cbcdce94abe2d33", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8d7f1d7ea6a0dcc922ad5d2e77bc74dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d675e0195c9feca956e637f3f1959f40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0960a9af91250e9faa1eaac32227bf6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "130f47aae365aabfec4360fa5b5ff554", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ef745100f5f34c8ff841b2b0b57eb33f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b5681673903ade13d69e295f82fdd009", "9ccd4cc6216eab35ddcb66a76b55dd2f", + "74ab206f14ac5f62653cd3dd71a7916d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d3212ab3922f147c3cf126c3b1aa17f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c5325015cb0b7c42839ac4aa21803fa0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dead0fe4030085c22e92d16bb110de9d", "3c6d97f25d6bc647c843850be007f512", + "262c96b1f2c4f85c86c0e9c77fedff1e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6b80af04470b83673d98f46925e678a5", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "138855d9bf0ccd0c62ac14c7bff4fd37", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b5681673903ade13d69e295f82fdd009", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "746c2e0f96ae2246d534d67102be068c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "92483ed631de21b685ffe6ccadbbec8f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "edae8ed67286ca6a31573a541b3deb6f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3c43020105ae93a301404b4cd6238654", "cef7cfdcb8ca8d2612f31a1fe95ce371", + "5621caef7cc1d6522903290ccc5c2cb8", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b55fea77f0e14a8bf8b6562b766fe91f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f81f31f1585c0f70438c09e829416f20", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "306a2f5dfd675df4ed9af44fd5cac8c0", "1dfda318021a05a7e72fd815ddb0dfc8", + "f35a3d13516440f9168076d9b07c9e98", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "65baca6167fe5249f7a839ce5b2fd591", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "64035142864914d05a48ef8e013631d0", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3c43020105ae93a301404b4cd6238654", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d6f6db079da9b8909a153c07cc9d0e63", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cbb6ab31547df6b91cfb48630fdffb48", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "01adcd8bf15fbf70df47fbf3a953aa14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dd2c5880a94ed3758bfea0b0e8c78286", "5f6c1725f4c7c73a8d8f0d9468106624", + "78ec6cf42cce4b1feb65e076c78ca241", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "59b578268ff26a1e21c5b4273f73f852", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ab10b22fb8dd8199040745565b28595d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "306a2f5dfd675df4ed9af44fd5cac8c0", "9209f83153ef6f09b5262536a2dc1671", + "13782526fc2726100cb3cf375b3150ed", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e47ded6c0eec1d5baadd02aff172f2b1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "205904fa3c644433b46e01c11dd2fe40", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "dd2c5880a94ed3758bfea0b0e8c78286", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7c8928a0d769f4264d195f39cb68a772", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ba539808a8501609ce052a1562a62b25", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ebb1a7b25a39d8b9868ec8a1243103f", "c2732a08997e1f5176dfb297d2e89235", + "42188e2dbb4e02cd353552ea147ad03f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "16761e7c8ba2645718153bed83ae78f6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0d928d6111f86c60ccefc6c6604d5659", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9d01c946a12f5ef9d9cebd9816e06014", "d738eb9f3f4f0b412b93687b55b6e45a", + "13c07441b47b0c1ed80f015ac302d220", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c0950e609f278efb7050d319a9756bb3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "291425aaf8206b20e88db8ebf3cf7e7f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ebb1a7b25a39d8b9868ec8a1243103f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "db645c96fc8be04015e0eb538afec9ae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e193b6b28ce798c44c744efde19eee9", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ac8e6391200cec2abdebb00744a2ba82", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d34ec07845cd8523651e5f5112984a14", "745c794b557d4a0d734e45d720a7f7ad", + "f9813870fc27941a7c00a0443d7c2fe7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a9e9805769fe1baf5c7933793ccca0d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4ed1a6200912995d4f571bdb7822aa83", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "768f63912e43148c13688d7f23281531", "43fb786fd2e79610d6a6d912b95f4509", + "02880fde51ac991ad18d8986f4e5145c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9051290279237f9fb1389989b142d2dd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "cb6238b8eb6b72980958e6fcceb2f2eb", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d34ec07845cd8523651e5f5112984a14", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "946af3a8f5362def5f4e27cb0fd4e754", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "885c384d90aaa34acd8303958033c252", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "54b17120f7d71ddb4d70590ecd231cc1", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ce55308d873f4cd244f16da2b06e06e", "af7b76d3471cfbdc97d1e57bc2876ce7", + "20b14a6b5af7aa356963bcaaf23d230d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "553a2c24939dff18ec5833c77f556cfb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "92e31a45513582f386dc9c22a57bbbbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "768f63912e43148c13688d7f23281531", "4e255554dab9dfa1064e20a905538308", + "aa25073115bad49432953254e7dce0bc", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "34cdc1be291c95981c98812c5c343a15", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "626321a6dfac542d0fc70321fac13ff3", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ce55308d873f4cd244f16da2b06e06e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ad78dfe7bbedf696dd58d9ad01bcfba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "8110ed10e7234851dff3c7e4a51108a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f6e36446a97611a4db4425df926974b2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4bb5d5ff4b25f391265b5231049a09a", "cf4867c6b1b8be86a7e0bee708c28d83", + "9c9c41435697f75fa118b6d6464ee7cb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5c1ec75a160c444fa90abf106fa1140e", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6dbf310a9c8d85f76306d6a35545f8af", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2e7927158e7b8e40e7269fc909fb584b", "8b72feff8bb0901229a2bd7da2857c4b", + "69e3361b7199e10e75685b90fb0df623", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "5b64a6911cb7c3d60bb8f961ed9782a2", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "1c6fda7501e0f8bdad972f7857cd9354", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a4bb5d5ff4b25f391265b5231049a09a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "f0fd9c09d454e4ce918faa97e9ac10be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "6fb9383302eb7e7a13387464d2634e03", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "a82f4080699300b659bbe1b5c4463147", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9106e0c820b03bcdde3aa94efc11a3e", "0408e10e51a31ac756a57d5149a2b409", + "38816245ed832ba313fefafcbed1e5c8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2266840f11ac4c066d941ec473b1a54f", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "80fce29dc82d5857c1ed5ef2aea16835", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "123028e18c2bfb334e34adb5a4f67de4", "1670eb8ed876e609ed81236a683b4a3d", + "2f8ab35f6e7030e82ca922a68b29af4a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7133de9d03a4b07716a12226b5e493e8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4fd485dadcb570e5a0a5addaf9ba84da", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c9106e0c820b03bcdde3aa94efc11a3e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "af6ae5c0eb28417bd251184baf2eaba7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "864d51fcc737bc73a3f588b67515039a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "ecedb178f7cad3dc1b921eca67f9efb6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ec2eae9e118506da8b33440b399511a", "108a4a6530a6b9c933ccf14edbd896be", + "5d34137cc8ddba75347b0fa1d0a91791", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "9e194755b2a37b615a517d5f8746dfbb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "14f2c5b9d2cd621c178a39f1ec0c38eb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "123028e18c2bfb334e34adb5a4f67de4", "2fdc713ba418780d0be33a3ebbcb323c", + "452f91b01833c57db4e909575a029ff6", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "3594eff52d5ed875bd9655ddbf106fae", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d3f140aea9e8eabf4e1e5190e0148288", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "7ec2eae9e118506da8b33440b399511a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "866f8df540dd3b58ab1339314d139cbd", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2ecb7890f00234bcb28c1d969f489012", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "0609ca0ff3ca90069e8b48829b4b0891", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "78de867c8ee947ed6d29055747f26949", "0a7cb4f51f1acf0940b59295b2327465", + "465dcb046a0449b9dfb3e0b297aa3863", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "bbf86f8174334f0b8d869fd8d58bf92d", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "da54cfb4530841bda29966cfa05f4879", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "2c979c2bddef79a760e72a802f83cc76", "545426be3436073ba63790aa3c4a5598", + "1fabf0655bedb671e4d7287fec8119ba", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "90d7e13aa2f9a064493ff2b3b5b12109", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "e4938219593bbed5ae638a93f2f4a580", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "78de867c8ee947ed6d29055747f26949", 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "72803589b453a29501540aeddc23e6f4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "c4793d431dbf2d88826bb440bf027512", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "839e86c681e97359f7819c766000dd1c", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "3052776d186fca6dd8011f4fe908a212", + "94b3e5bcd6b849b66a4571ec3d23f9be", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "91d6bdbc62d4bb80c9b371d9704e3c9e", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "4f750f6375524311d260306deb233861", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "03ce2d07cac044d6b68604d398571844", + "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "d05a237ed7a9ca877256b71555b1b8e4", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "03ce2d07cac044d6b68604d398571844", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "68ece92dcbe70a2ae9776d72972740a7", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "380d296d0d55a49dd86ee562b053a9d8", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + }; + return kDigest[id]; +} +#endif + +struct ConvolveTestParam { + ConvolveTestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +struct ConvolveTypeParam { + ConvolveTypeParam(bool is_intra_block_copy, bool 
is_compound, + bool has_vertical_filter, bool has_horizontal_filter) + : is_intra_block_copy(is_intra_block_copy), + is_compound(is_compound), + has_vertical_filter(has_vertical_filter), + has_horizontal_filter(has_horizontal_filter) {} + bool is_intra_block_copy; + bool is_compound; + bool has_vertical_filter; + bool has_horizontal_filter; +}; + +std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) { + return os << "is_intra_block_copy: " << param.is_intra_block_copy + << ", is_compound: " << param.is_compound + << ", has_(vertical/horizontal)_filter: " + << param.has_vertical_filter << "/" << param.has_horizontal_filter; +} + +// TODO(b/146062680): split this to ConvolveTest and ConvolveScaleTest to +// simplify the members and test logic. +template <int bitdepth, typename Pixel> +class ConvolveTest + : public testing::TestWithParam< + std::tuple<ConvolveTestParam, ConvolveTypeParam, bool>> { + public: + ConvolveTest() = default; + ~ConvolveTest() override = default; + + void SetUp() override { + ConvolveInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + GetConvolveFuncs(dsp, &base_convolve_func_, &base_convolve_scale_func_); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_convolve_func_ = nullptr; + base_convolve_scale_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + ConvolveInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + ConvolveInit_AVX2(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + ConvolveInit_NEON(); + } else { + FAIL() << "Unrecognized 
architecture prefix in test case name: " + << test_case; + } + + GetConvolveFuncs(dsp, &cur_convolve_func_, &cur_convolve_scale_func_); + + // Skip functions that have not been specialized for this particular + // architecture. + if (cur_convolve_func_ == base_convolve_func_) { + cur_convolve_func_ = nullptr; + } + if (cur_convolve_scale_func_ == base_convolve_scale_func_) { + cur_convolve_scale_func_ = nullptr; + } + } + + protected: + int GetDigestId() const { + // id is the combination of the 3-dimension array: + // (param_, type_param_, is_scaled_convolve_) + // The number of each array is 20, 16, 2. + // The range of id is from 0 to 20x16x2 - 1. + // is_scaled_convolve_: false, id += 0; true, id += 1; + // type_param_: (0, 0, 0, 0), id += 0 * 2. + // (0, 0, 0, 1), id += 1 * 2; (0, 0, 1, 0), id += 2 * 2; + // ... + // param_: (2, 2), id += 0 * 32; (2, 4), id += 1 * 32; + // (4, 2), id += 2 * 32; (4, 4), id += 3 * 32; + // ... + int id = static_cast<int>(is_scaled_convolve_); + id += 2 * static_cast<int>(type_param_.has_horizontal_filter); + id += 2 * 2 * static_cast<int>(type_param_.has_vertical_filter); + id += 2 * 4 * static_cast<int>(type_param_.is_compound); + id += 2 * 8 * static_cast<int>(type_param_.is_intra_block_copy); + if (param_.width == param_.height) { + id += 32 * 3 * static_cast<int>(std::log2(param_.width) - 1); + } else if (param_.width < param_.height) { + id += 32 * (1 + 3 * static_cast<int>(std::log2(param_.width) - 1)); + } else { + // param_.width > param_.height + if (param_.width == 8 && param_.height == 2) { + // Special case is at the end of the array. 
+ id += 32 * 19; + } else { + id += 32 * (2 + 3 * static_cast<int>(std::log2(param_.height) - 1)); + } + } + return id; + } + + void GetConvolveFuncs(const Dsp* dsp, ConvolveFunc* func, + ConvolveScaleFunc* scale_func); + void SetInputData(bool use_fixed_values, int value); + void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest, + libvpx_test::MD5* md5_digest); + void Check16Bit(bool use_fixed_values, const uint16_t* src, + const uint16_t* dest, libvpx_test::MD5* md5_digest); + // |num_runs| covers the categories of filters (6) and the number of filters + // under each category (16). + void Test(bool use_fixed_values, int value, + int num_runs = kMinimumViableRuns); + + const ConvolveTestParam param_ = std::get<0>(GetParam()); + const ConvolveTypeParam type_param_ = std::get<1>(GetParam()); + const bool is_scaled_convolve_ = std::get<2>(GetParam()); + + private: + ConvolveFunc base_convolve_func_; + ConvolveFunc cur_convolve_func_; + ConvolveScaleFunc base_convolve_scale_func_; + ConvolveScaleFunc cur_convolve_scale_func_; + // Convolve filters are 7-tap, which needs 3 pixels (kRestorationBoder) + // padding. + // When is_scaled_convolve_ is true, the source can be at most 2 times of + // max width/height. So we allocate a larger memory for it and setup the + // extra memory when is_scaled_convolve_ is true. + Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {}; + uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {}; + uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {}; + Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {}; + + const int source_stride_ = + is_scaled_convolve_ ? kMaxBlockWidth * 2 : kMaxBlockWidth; + const int source_height_ = + is_scaled_convolve_ ? 
kMaxBlockHeight * 2 : kMaxBlockHeight; +}; + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::GetConvolveFuncs( + const Dsp* const dsp, ConvolveFunc* func, ConvolveScaleFunc* scale_func) { + if (is_scaled_convolve_) { + *func = nullptr; + *scale_func = dsp->convolve_scale[type_param_.is_compound]; + } else { + *scale_func = nullptr; + *func = + dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound] + [type_param_.has_vertical_filter] + [type_param_.has_horizontal_filter]; + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values, + int value) { + if (use_fixed_values) { + std::fill(source_, source_ + source_height_ * source_stride_, value); + } else { + const int offset = + kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop; + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int height = is_scaled_convolve_ ? param_.height * 2 : param_.height; + const int width = is_scaled_convolve_ ? param_.width * 2 : param_.width; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask; + } + } + // Copy border pixels to the left and right borders. + for (int y = 0; y < height; ++y) { + Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_], + source_[y * source_stride_ + offset], kConvolveBorderLeftTop); + Memset(&source_[y * source_stride_ + offset + width], + source_[y * source_stride_ + offset + width - 1], + kConvolveBorderLeftTop); + } + // Copy border pixels to the top and bottom borders. 
+ for (int y = 0; y < kConvolveBorderLeftTop; ++y) { + memcpy(&source_[y * source_stride_], + &source_[kConvolveBorderLeftTop * source_stride_], + source_stride_ * sizeof(Pixel)); + memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_], + &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_], + source_stride_ * sizeof(Pixel)); + } + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values, + const Pixel* src, const Pixel* dest, + libvpx_test::MD5* md5_digest) { + if (use_fixed_values) { + // For fixed values, input and output are identical. + const bool success = + test_utils::CompareBlocks(src, dest, param_.width, param_.height, + kMaxBlockWidth, kMaxBlockWidth, false, false); + EXPECT_TRUE(success); + } else { + // For random input, compare md5. + const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel); + md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size); + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values, + const uint16_t* src, + const uint16_t* dest, + libvpx_test::MD5* md5_digest) { + if (use_fixed_values) { + // For fixed values, input and output are identical. + const bool success = + test_utils::CompareBlocks(src, dest, param_.width, param_.height, + kMaxBlockWidth, kMaxBlockWidth, false); + EXPECT_TRUE(success); + } else { + // For random input, compare md5. 
+ const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t); + md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size); + } +} + +template <int bitdepth, typename Pixel> +void ConvolveTest<bitdepth, Pixel>::Test(bool use_fixed_values, int value, + int num_runs /*= 16 * 6*/) { + // There's no meaning testing fixed input in compound convolve. + if (type_param_.is_compound && use_fixed_values) GTEST_SKIP(); + + // Scaled convolve does not behave differently under most params. Only need to + // test the enabled compound implementation. + if (is_scaled_convolve_ && + (type_param_.is_intra_block_copy || type_param_.has_vertical_filter || + type_param_.has_horizontal_filter)) { + GTEST_SKIP(); + } + + // There should not be any function set for this combination. + if (type_param_.is_intra_block_copy && type_param_.is_compound) { + ASSERT_EQ(cur_convolve_func_, nullptr); + return; + } + + // Compound and intra block copy functions are only used for blocks 4x4 or + // greater. + if (type_param_.is_compound || type_param_.is_intra_block_copy) { + if (param_.width < 4 || param_.height < 4) { + GTEST_SKIP(); + } + } + + // Skip unspecialized functions. + if (cur_convolve_func_ == nullptr && cur_convolve_scale_func_ == nullptr) { + GTEST_SKIP(); + } + + SetInputData(use_fixed_values, value); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + GetDigestId()); + // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x. 
+ const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1; + const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1; + int subpixel_x = 0; + int subpixel_y = 0; + int vertical_index = 0; + int horizontal_index = 0; + const int offset = + kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop; + const int offset_scale = + kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop; + const Pixel* const src = source_ + offset; + const Pixel* const src_scale = source_ + offset_scale; + const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel); + const ptrdiff_t src_stride_16 = source_stride_; + const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel); + // Pack Compound output since we control the predictor buffer. + const ptrdiff_t dst_stride_compound = param_.width; + + // Output is always 16 bits regardless of |bitdepth|. + uint16_t* dst_16 = dest_16bit_ + offset; + // Output depends on |bitdepth|. + Pixel* dst_pixel = dest_clipped_ + offset; + + // Collect the first |kMinimumViableRuns| into one md5 buffer. + libvpx_test::MD5 md5_digest; + + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + // Test every filter. + // Because of masking |subpixel_{x,y}| values roll over every 16 iterations. + subpixel_x += 1 << 6; + subpixel_y += 1 << 6; + + const int horizontal_filter_id = (subpixel_x >> 6) & 0xF; + const int vertical_filter_id = (subpixel_y >> 6) & 0xF; + + // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy + // function. + if (horizontal_filter_id == 0 || vertical_filter_id == 0) { + continue; + } + + // For focused speed testing these can be set to the desired filter. Want + // only 8 tap filters? Set |{vertical,horizontal}_index| to 2. + vertical_index += static_cast<int>(i % 16 == 0); + vertical_index %= 4; + horizontal_index += static_cast<int>(i % 16 == 0); + horizontal_index %= 4; + + if (is_scaled_convolve_) { + ASSERT_EQ(cur_convolve_func_, nullptr); + // Output type is uint16_t. 
+ const absl::Time start = absl::Now(); + if (type_param_.is_compound) { + cur_convolve_scale_func_( + source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x, + step_y, param_.width, param_.height, dst_16, dst_stride_compound); + } else { + cur_convolve_scale_func_( + source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x, + step_y, param_.width, param_.height, dst_pixel, dst_stride); + } + elapsed_time += absl::Now() - start; + } else if (type_param_.is_compound) { + ASSERT_EQ(cur_convolve_scale_func_, nullptr); + // Output type is uint16_t. + const absl::Time start = absl::Now(); + cur_convolve_func_(src, src_stride, horizontal_index, vertical_index, + horizontal_filter_id, vertical_filter_id, param_.width, + param_.height, dst_16, dst_stride_compound); + elapsed_time += absl::Now() - start; + } else { + ASSERT_EQ(cur_convolve_scale_func_, nullptr); + // Output type is Pixel. + const absl::Time start = absl::Now(); + cur_convolve_func_(src, src_stride, horizontal_index, vertical_index, + horizontal_filter_id, vertical_filter_id, param_.width, + param_.height, dst_pixel, dst_stride); + elapsed_time += absl::Now() - start; + } + + // Only check the output for the first set. After that it's just repeated + // runs for speed timing. + if (i >= kMinimumViableRuns) continue; + + if (is_scaled_convolve_) { + // Convolve function does not clip the output. The clipping is applied + // later. But libaom clips the output. So we apply clipping to match + // libaom in tests. 
+ if (type_param_.is_compound) { + const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1)); + Pixel* dest_row = dest_clipped_; + for (int y = 0; y < kMaxBlockHeight; ++y) { + for (int x = 0; x < kMaxBlockWidth; ++x) { + dest_row[x] = static_cast<Pixel>(Clip3( + dest_16bit_[y * dst_stride_compound + x] - single_round_offset, + 0, (1 << bitdepth) - 1)); + } + dest_row += kMaxBlockWidth; + } + } + + if (type_param_.is_compound) { + Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16, + &md5_digest); + } else { + Check(use_fixed_values, src_scale, dst_pixel, &md5_digest); + } + } else if (type_param_.is_compound) { + // Need to copy source to a uint16_t buffer for comparison. + Pixel* src_ptr = source_; + uint16_t* src_ptr_16 = source_16bit_; + for (int y = 0; y < kMaxBlockHeight; ++y) { + for (int x = 0; x < kMaxBlockWidth; ++x) { + src_ptr_16[x] = src_ptr[x]; + } + src_ptr += src_stride_16; + src_ptr_16 += src_stride_16; + } + + Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16, &md5_digest); + } else { + Check(use_fixed_values, src, dst_pixel, &md5_digest); + } + } + + if (!use_fixed_values) { + // md5 sums are only calculated for random input. + const char* ref_digest; + if (bitdepth == 8) { + ref_digest = GetDigest8bpp(GetDigestId()); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + ref_digest = GetDigest10bpp(GetDigestId()); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } + const char* direction; + if (is_scaled_convolve_ || (type_param_.has_vertical_filter && + type_param_.has_horizontal_filter)) { + direction = "2D"; + } else if (type_param_.has_vertical_filter) { + direction = "Vertical"; + } else if (type_param_.has_horizontal_filter) { + direction = "Horizontal"; + } else { + direction = "Copy"; + } + const auto elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)); + printf("Mode Convolve%s%s%s%s[%25s]: %5d us MD5: %s\n", + type_param_.is_compound ? "Compound" : "", + type_param_.is_intra_block_copy ? 
"IntraBlockCopy" : "", + is_scaled_convolve_ ? "Scale" : "", direction, + absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), + elapsed_time_us, md5_digest.Get()); + EXPECT_STREQ(ref_digest, md5_digest.Get()); + } +} + +void ApplyFilterToSignedInput(const int min_input, const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + int min = 0, max = 0; + for (int i = 0; i < kSubPixelTaps; ++i) { + const int tap = filter[i]; + if (tap > 0) { + max += max_input * tap; + min += min_input * tap; + } else { + min += max_input * tap; + max += min_input * tap; + } + } + *min_output = min; + *max_output = max; +} + +void ApplyFilterToUnsignedInput(const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output); +} + +// Validate the maximum ranges for different parts of the Convolve process. +template <int bitdepth> +void ShowRange() { + // Subtract one from the shift bits because the filter is pre-shifted by 1. + constexpr int horizontal_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsHorizontal12bpp - 1 + : kInterRoundBitsHorizontal - 1; + constexpr int vertical_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsVertical12bpp - 1 + : kInterRoundBitsVertical - 1; + constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1; + + constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset; + + constexpr int max_input = (1 << bitdepth) - 1; + + const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8]; + + // First pass. + printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, + max_input); + + int min, max; + ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max); + + if (bitdepth == 8) { + // 8bpp can use int16_t for sums. + assert(min > INT16_MIN); + assert(max < INT16_MAX); + } else { + // 10bpp and 12bpp require int32_t. 
+ assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + const int first_pass_min = RightShiftWithRounding(min, horizontal_bits); + const int first_pass_max = RightShiftWithRounding(max, horizontal_bits); + + // All bitdepths can use int16_t for first pass output. + assert(first_pass_min > INT16_MIN); + assert(first_pass_max < INT16_MAX); + + printf(" first pass output range: [%8d, %8d]\n", first_pass_min, + first_pass_max); + + // Second pass. + ApplyFilterToSignedInput(first_pass_min, first_pass_max, + worst_convolve_filter, &min, &max); + + // All bitdepths require int32_t for second pass sums. + assert(min < INT16_MIN && min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + // Second pass non-compound output is clipped to Pixel values. + const int second_pass_min = + Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input); + const int second_pass_max = + Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input); + printf(" second pass output range: [%8d, %8d]\n", second_pass_min, + second_pass_max); + + // Output is Pixel so matches Pixel values. + assert(second_pass_min == 0); + assert(second_pass_max == max_input); + + const int compound_second_pass_min = + RightShiftWithRounding(min, compound_vertical_bits) + compound_offset; + const int compound_second_pass_max = + RightShiftWithRounding(max, compound_vertical_bits) + compound_offset; + + printf(" compound second pass output range: [%8d, %8d]\n", + compound_second_pass_min, compound_second_pass_max); + + if (bitdepth == 8) { + // 8bpp output is int16_t without an offset. + assert(compound_second_pass_min > INT16_MIN); + assert(compound_second_pass_max < INT16_MAX); + } else { + // 10bpp and 12bpp use the offset to fit inside uint16_t. 
+ assert(compound_second_pass_min > 0); + assert(compound_second_pass_max < UINT16_MAX); + } + + printf("\n"); +} + +TEST(ConvolveTest, ShowRange) { + ShowRange<kBitdepth8>(); + ShowRange<kBitdepth10>(); + ShowRange<kBitdepth12>(); +} + +using ConvolveTest8bpp = ConvolveTest<8, uint8_t>; + +TEST_P(ConvolveTest8bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, 255); +} + +TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveTest8bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +const ConvolveTestParam kConvolveParam[] = { + ConvolveTestParam(2, 2), ConvolveTestParam(2, 4), + ConvolveTestParam(4, 2), ConvolveTestParam(4, 4), + ConvolveTestParam(4, 8), ConvolveTestParam(8, 2), + ConvolveTestParam(8, 4), ConvolveTestParam(8, 8), + ConvolveTestParam(8, 16), ConvolveTestParam(16, 8), + ConvolveTestParam(16, 16), ConvolveTestParam(16, 32), + ConvolveTestParam(32, 16), ConvolveTestParam(32, 32), + ConvolveTestParam(32, 64), ConvolveTestParam(64, 32), + ConvolveTestParam(64, 64), ConvolveTestParam(64, 128), + ConvolveTestParam(128, 64), ConvolveTestParam(128, 128), +}; + +const ConvolveTypeParam kConvolveTypeParam[] = { + ConvolveTypeParam(false, false, false, false), + ConvolveTypeParam(false, false, false, true), + ConvolveTypeParam(false, false, true, false), + ConvolveTypeParam(false, false, true, true), + ConvolveTypeParam(false, true, false, false), + ConvolveTypeParam(false, true, false, true), + ConvolveTypeParam(false, true, true, false), + ConvolveTypeParam(false, true, true, true), + ConvolveTypeParam(true, false, false, false), + ConvolveTypeParam(true, false, false, true), + ConvolveTypeParam(true, false, true, false), + ConvolveTypeParam(true, false, true, true), + ConvolveTypeParam(true, true, false, false), + ConvolveTypeParam(true, true, false, true), + ConvolveTypeParam(true, true, true, false), + 
ConvolveTypeParam(true, true, true, true), +}; + +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_SSE4_1 + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_ENABLE_AVX2 + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ConvolveTest10bpp = ConvolveTest<10, uint16_t>; + +TEST_P(ConvolveTest10bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << 10) - 1); +} + +TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveTest10bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp, + testing::Combine(testing::ValuesIn(kConvolveParam), + testing::ValuesIn(kConvolveTypeParam), + testing::Bool())); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc new file mode 100644 index 0000000..b3f3a2e --- /dev/null +++ b/src/dsp/distance_weighted_blend_test.cc @@ -0,0 +1,324 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/distance_weighted_blend.h" + +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 500000; + +constexpr int kQuantizedDistanceLookup[4][2] = { + {9, 7}, {11, 5}, {12, 4}, {13, 3}}; + +struct TestParam { + TestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +std::ostream& operator<<(std::ostream& os, const TestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height; +} + +template <int bitdepth, typename Pixel> +class DistanceWeightedBlendTest : public testing::TestWithParam<TestParam>, + public test_utils::MaxAlignedAllocable { + public: + DistanceWeightedBlendTest() = default; + ~DistanceWeightedBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + DistanceWeightedBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_func_ = dsp->distance_weighted_blend; + const testing::TestInfo* const test_info 
= + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + DistanceWeightedBlendInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + DistanceWeightedBlendInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->distance_weighted_blend; + } + + protected: + void Test(const char* digest, int num_tests); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels; + const int width_ = GetParam().width; + const int height_ = GetParam().height; + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = + {}; + dsp::DistanceWeightedBlendFunc base_func_; + dsp::DistanceWeightedBlendFunc func_; +}; + +template <int bitdepth, typename Pixel> +void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest, + int num_tests) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + PredType* src_1 = source1_; + PredType* src_2 = source2_; + + const int index = rnd.Rand8() & 3; + const uint8_t weight_0 = kQuantizedDistanceLookup[index][0]; + const uint8_t weight_1 = kQuantizedDistanceLookup[index][1]; + // In libgav1, predictors have an offset which are later subtracted and + // clipped in distance weighted blending. Therefore we add the offset + // here to match libaom's implementation. 
+ for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + // distance_weighted_blend is applied to compound prediction values. This + // implies a range far exceeding that of pixel values. + // The ranges include kCompoundOffset in 10bpp and 12bpp. + // see: src/dsp/convolve.cc & src/dsp/warp.cc. + static constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, + }; + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + src_1 += width_; + src_2 += width_; + } + absl::Duration elapsed_time; + for (int i = 0; i < num_tests; ++i) { + const absl::Time start = absl::Now(); + func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_, + sizeof(Pixel) * kDestStride); + elapsed_time += absl::Now() - start; + } + + test_utils::CheckMd5Digest( + "DistanceWeightedBlend", + absl::StrFormat("BlockSize%dx%d", width_, height_).c_str(), digest, dest_, + sizeof(dest_), elapsed_time); +} + +const TestParam kTestParam[] = { + TestParam(4, 4), TestParam(4, 8), TestParam(4, 16), + TestParam(8, 4), TestParam(8, 8), TestParam(8, 16), + TestParam(8, 32), TestParam(16, 4), TestParam(16, 8), + TestParam(16, 16), TestParam(16, 32), TestParam(16, 64), + TestParam(32, 8), TestParam(32, 16), TestParam(32, 32), + TestParam(32, 64), TestParam(32, 128), TestParam(64, 16), + TestParam(64, 32), TestParam(64, 64), TestParam(64, 128), + TestParam(128, 32), TestParam(128, 64), TestParam(128, 128), +}; + +const char* GetDistanceWeightedBlendDigest8bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "ebf389f724f8ab46a2cac895e4e073ca", + "09acd567b6b12c8cf8eb51d8b86eb4bf", + 
"57bb4d65695d8ec6752f2bd8686b64fd", + }; + static const char* const kDigestsWidth8[] = { + "270905ac76f9a2cba8a552eb0bf7c8c1", + "f0801c8574d2c271ef2bbea77a1d7352", + "e761b580e3312be33a227492a233ce72", + "ff214dab1a7e98e2285961d6421720c6", + }; + static const char* const kDigestsWidth16[] = { + "4f712609a36e817f9752326d58562ff8", "14243f5c5f7c7104160c1f2cef0a0fbc", + "3ac3f3161b7c8dd8436b02abfdde104a", "81a00b704e0e41a5dbe6436ac70c098d", + "af8fd02017c7acdff788be742d700baa", + }; + static const char* const kDigestsWidth32[] = { + "ee34332c66a6d6ed8ce64031aafe776c", "b5e3d22bd2dbdb624c8b86a1afb5ce6d", + "607ffc22098d81b7e37a7bf62f4af5d3", "3823dbf043b4682f56d5ca698e755ea5", + "57f7e8d1e67645269ce760a2c8da4afc", + }; + static const char* const kDigestsWidth64[] = { + "4acf556b921956c2bc24659cd5128401", + "a298c544c9c3b27924b4c23cc687ea5a", + "539e2df267782ce61c70103b23b7d922", + "3b0cb2a0b5d384efee4d81401025bec1", + }; + static const char* const kDigestsWidth128[] = { + "d71ee689a40ff5f390d07717df4b7233", + "8b56b636dd712c2f8d138badb7219991", + "8cfc8836908902b8f915639b7bff45b3", + }; + const int height_index = + FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 2]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>; + +TEST_P(DistanceWeightedBlendTest8bpp, Blending) { + Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, 
DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDistanceWeightedBlendDigest10bpp(const TestParam block_size) { + static const char* const kDigestsWidth4[] = { + "55f594b56e16d5c401274affebbcc3d3", + "69df14da4bb33a8f7d7087921008e919", + "1b61f33604c54015794198a13bfebf46", + }; + static const char* const kDigestsWidth8[] = { + "825a938185b152f7cf09bf1c0723ce2b", + "85ea315c51d979bc9b45834d6b40ec6f", + "92ebde208e8c39f7ec6de2de82182dbb", + "520f84716db5b43684dbb703806383fe", + }; + static const char* const kDigestsWidth16[] = { + "12ca23e3e2930005a0511646e8c83da4", "6208694a6744f4a3906f58c1add670e3", + "a33d63889df989a3bbf84ff236614267", "34830846ecb0572a98bbd192fed02b16", + "34bb2f79c0bd7f9a80691b8af597f2a8", + }; + static const char* const kDigestsWidth32[] = { + "fa97f2d0e3143f1f44d3ac018b0d696d", "3df4a22456c9ab6ed346ab1b9750ae7d", + "6276a058b35c6131bc0c94a4b4a37ebc", "9ca42da5d2d5eb339df03ae2c7a26914", + "2ff0dc010a7b40830fb47423a9beb894", + }; + static const char* const kDigestsWidth64[] = { + "800e692c520f99223bc24c1ac95a0166", + "818b6d20426585ef7fe844015a03aaf5", + "fb48691ccfff083e01d74826e88e613f", + "0bd350bc5bc604a224d77a5f5a422698", + }; + static const char* const kDigestsWidth128[] = { + "02aac5d5669c1245da876c5440c4d829", + "a130840813cd6bd69d09bcf5f8d0180f", + "6ece1846bea55e8f8f2ed7fbf73718de", + }; + const int height_index = + FloorLog2(block_size.height) - FloorLog2(block_size.width) + 2; + switch (block_size.width) { + case 4: + return kDigestsWidth4[height_index - 2]; + case 8: + return kDigestsWidth8[height_index - 1]; + case 16: + return kDigestsWidth16[height_index]; + case 32: + return 
kDigestsWidth32[height_index]; + case 64: + return kDigestsWidth64[height_index]; + default: + EXPECT_EQ(block_size.width, 128) + << "Unknown width parameter: " << block_size.width; + return kDigestsWidth128[height_index]; + } +} + +using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>; + +TEST_P(DistanceWeightedBlendTest10bpp, Blending) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc index 5b54c4e..a3d7701 100644 --- a/src/dsp/dsp.cc +++ b/src/dsp/dsp.cc @@ -16,7 +16,6 @@ #include <mutex> // NOLINT (unapproved c++11 header) -#include "src/dsp/arm/weight_mask_neon.h" #include "src/dsp/average_blend.h" #include "src/dsp/cdef.h" #include "src/dsp/convolve.h" @@ -24,6 +23,10 @@ #include "src/dsp/film_grain.h" #include "src/dsp/intra_edge.h" #include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" +#include "src/dsp/intrapred_directional.h" +#include "src/dsp/intrapred_filter.h" +#include "src/dsp/intrapred_smooth.h" #include "src/dsp/inverse_transform.h" #include "src/dsp/loop_filter.h" #include "src/dsp/loop_restoration.h" @@ -39,6 +42,30 @@ namespace libgav1 { namespace dsp_internal { +void DspInit_C() { + dsp::AverageBlendInit_C(); + dsp::CdefInit_C(); + dsp::ConvolveInit_C(); + dsp::DistanceWeightedBlendInit_C(); + dsp::FilmGrainInit_C(); + dsp::IntraEdgeInit_C(); + 
dsp::IntraPredCflInit_C(); + dsp::IntraPredDirectionalInit_C(); + dsp::IntraPredFilterInit_C(); + dsp::IntraPredInit_C(); + dsp::IntraPredSmoothInit_C(); + dsp::InverseTransformInit_C(); + dsp::LoopFilterInit_C(); + dsp::LoopRestorationInit_C(); + dsp::MaskBlendInit_C(); + dsp::MotionFieldProjectionInit_C(); + dsp::MotionVectorSearchInit_C(); + dsp::ObmcInit_C(); + dsp::SuperResInit_C(); + dsp::WarpInit_C(); + dsp::WeightMaskInit_C(); +} + dsp::Dsp* GetWritableDspTable(int bitdepth) { switch (bitdepth) { case 8: { @@ -62,23 +89,7 @@ namespace dsp { void DspInit() { static std::once_flag once; std::call_once(once, []() { - AverageBlendInit_C(); - CdefInit_C(); - ConvolveInit_C(); - DistanceWeightedBlendInit_C(); - FilmGrainInit_C(); - IntraEdgeInit_C(); - IntraPredInit_C(); - InverseTransformInit_C(); - LoopFilterInit_C(); - LoopRestorationInit_C(); - MaskBlendInit_C(); - MotionFieldProjectionInit_C(); - MotionVectorSearchInit_C(); - ObmcInit_C(); - SuperResInit_C(); - WarpInit_C(); - WeightMaskInit_C(); + dsp_internal::DspInit_C(); #if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2 const uint32_t cpu_features = GetCpuInfo(); #if LIBGAV1_ENABLE_SSE4_1 @@ -87,7 +98,11 @@ void DspInit() { CdefInit_SSE4_1(); ConvolveInit_SSE4_1(); DistanceWeightedBlendInit_SSE4_1(); + FilmGrainInit_SSE4_1(); IntraEdgeInit_SSE4_1(); + IntraPredCflInit_SSE4_1(); + IntraPredDirectionalInit_SSE4_1(); + IntraPredFilterInit_SSE4_1(); IntraPredInit_SSE4_1(); IntraPredCflInit_SSE4_1(); IntraPredSmoothInit_SSE4_1(); @@ -108,6 +123,7 @@ void DspInit() { #endif // LIBGAV1_ENABLE_SSE4_1 #if LIBGAV1_ENABLE_AVX2 if ((cpu_features & kAVX2) != 0) { + CdefInit_AVX2(); ConvolveInit_AVX2(); LoopRestorationInit_AVX2(); #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -125,7 +141,7 @@ void DspInit() { IntraEdgeInit_NEON(); IntraPredCflInit_NEON(); IntraPredDirectionalInit_NEON(); - IntraPredFilterIntraInit_NEON(); + IntraPredFilterInit_NEON(); IntraPredInit_NEON(); IntraPredSmoothInit_NEON(); InverseTransformInit_NEON(); 
@@ -138,6 +154,9 @@ void DspInit() { SuperResInit_NEON(); WarpInit_NEON(); WeightMaskInit_NEON(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + InverseTransformInit10bpp_NEON(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 #endif // LIBGAV1_ENABLE_NEON }); } diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h index fcbac3a..153db7f 100644 --- a/src/dsp/dsp.h +++ b/src/dsp/dsp.h @@ -17,7 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_DSP_H_ #define LIBGAV1_SRC_DSP_DSP_H_ -#include <cstddef> // ptrdiff_t +#include <cstddef> #include <cstdint> #include <cstdlib> @@ -372,8 +372,9 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width, // |coefficients| is the upscale filter used by each pixel in a row. It is not // used by the C function. // |source| is the input frame buffer. It will be line extended. +// |source_stride| is given in pixels. // |dest| is the output buffer. -// |stride| is given in pixels, and shared by |source| and |dest|. +// |dest_stride| is given in pixels. // |height| is the height of the block to be processed. // |downscaled_width| is the width of the input frame. // |upscaled_width| is the width of the output frame. @@ -381,9 +382,10 @@ using SuperResCoefficientsFunc = void (*)(int upscaled_width, // pixel. // |initial_subpixel_x| is a base offset from which |step| increments. using SuperResFunc = void (*)(const void* coefficients, void* source, - ptrdiff_t stride, int height, + ptrdiff_t source_stride, int height, int downscaled_width, int upscaled_width, - int initial_subpixel_x, int step, void* dest); + int initial_subpixel_x, int step, void* dest, + ptrdiff_t dest_stride); // Loop restoration function signature. Sections 7.16, 7.17. // |restoration_info| contains loop restoration information, such as filter @@ -391,14 +393,15 @@ using SuperResFunc = void (*)(const void* coefficients, void* source, // |source| is the input frame buffer, which is deblocked and cdef filtered. // |top_border| and |bottom_border| are the top and bottom borders. // |dest| is the output. 
-// |stride| is given in pixels, and shared by |source|, |top_border|, -// |bottom_border| and |dest|. +// |stride| is given in pixels, and shared by |source| and |dest|. +// |top_border_stride| and |bottom_border_stride| are given in pixels. // |restoration_buffer| contains buffers required for self guided filter and // wiener filter. They must be initialized before calling. using LoopRestorationFunc = void (*)( const RestorationUnitInfo& restoration_info, const void* source, - const void* top_border, const void* bottom_border, ptrdiff_t stride, - int width, int height, RestorationBuffer* restoration_buffer, void* dest); + ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride, + const void* bottom_border, ptrdiff_t bottom_border_stride, int width, + int height, RestorationBuffer* restoration_buffer, void* dest); // Index 0 is Wiener Filter. // Index 1 is Self Guided Restoration Filter. @@ -900,6 +903,11 @@ namespace dsp_internal { (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1) +// Initializes C-only function pointers. Note some entries may be set to +// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant +// for use in tests only, it is not thread-safe. +void DspInit_C(); + // Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't // exist. This version is meant for use by test or dsp/*Init() functions only. dsp::Dsp* GetWritableDspTable(int bitdepth); diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc new file mode 100644 index 0000000..bf7b9f3 --- /dev/null +++ b/src/dsp/dsp_test.cc @@ -0,0 +1,248 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/dsp.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> + +#include "absl/strings/str_cat.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#include "tests/utils.h" +#endif + +namespace libgav1 { +namespace dsp { +namespace { + +// Maps 1D transform to the maximum valid size for the corresponding transform. +constexpr int kMax1DTransformSize[kNum1DTransforms] = { + k1DTransformSize64, // Dct. + k1DTransformSize16, // Adst. + k1DTransformSize32, // Identity. + k1DTransformSize4, // Wht. 
+}; + +void CheckTables(bool c_only) { +#if LIBGAV1_MAX_BITDEPTH >= 10 + static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10}; +#else + static constexpr int kBitdepths[] = {kBitdepth8}; +#endif + + for (const auto& bitdepth : kBitdepths) { + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth)); + for (int i = 0; i < kNumTransformSizes; ++i) { + for (int j = 0; j < kNumIntraPredictors; ++j) { + EXPECT_NE(dsp->intra_predictors[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr); + EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr); + EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr); + EXPECT_NE(dsp->filter_intra_predictor, nullptr); + for (int i = 0; i < kNumTransformSizes; ++i) { + if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) { + EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr) + << "index [" << i << "]"; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } else { + EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr) + << "index [" << i << "]"; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + } + EXPECT_NE(dsp->intra_edge_filter, nullptr); + EXPECT_NE(dsp->intra_edge_upsampler, nullptr); + for (int i = 0; i < kNum1DTransforms; ++i) { + for (int j = 0; j < kNum1DTransformSizes; ++j) { + for (int k = 0; k < 2; ++k) { + if (j <= kMax1DTransformSize[i]) { + EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr) + << "index [" << i << "][" << j << "][" << k << "]"; + } else { + EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr) + << "index [" << i << "][" << j << "][" << k << "]"; + } + } + } + } + for (int i = 0; i < kNumLoopFilterSizes; ++i) { + for (int j = 0; j < kNumLoopFilterTypes; ++j) 
{ + EXPECT_NE(dsp->loop_filters[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + for (int i = 0; i < 2; ++i) { + EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]"; + } + + bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON; +#if LIBGAV1_ENABLE_SSE4_1 + const uint32_t cpu_features = GetCpuInfo(); + super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0; +#endif + if (c_only) super_res_coefficients_is_nonnull = false; + if (super_res_coefficients_is_nonnull) { + EXPECT_NE(dsp->super_res_coefficients, nullptr); + } else { + EXPECT_EQ(dsp->super_res_coefficients, nullptr); + } + + EXPECT_NE(dsp->super_res, nullptr); + EXPECT_NE(dsp->cdef_direction, nullptr); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + EXPECT_NE(dsp->cdef_filters[i][j], nullptr) + << "index [" << i << "][" << j << "]"; + } + } + for (auto convolve_func : dsp->convolve_scale) { + EXPECT_NE(convolve_func, nullptr); + } + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 2; ++l) { + for (int m = 0; m < 2; ++m) { + if (j == 1 && k == 1) { + EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr); + } else { + EXPECT_NE(dsp->convolve[j][k][l][m], nullptr); + } + } + } + } + } + for (const auto& m : dsp->mask_blend) { + for (int i = 0; i < 2; ++i) { + if (i == 0 || bitdepth >= 10) { + EXPECT_NE(m[i], nullptr); + } else { + EXPECT_EQ(m[i], nullptr); + } + } + } + for (const auto& m : dsp->inter_intra_mask_blend_8bpp) { + if (bitdepth == 8) { + EXPECT_NE(m, nullptr); + } else { + EXPECT_EQ(m, nullptr); + } + } + for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) { + const int width_index = k4x4WidthLog2[i] - 1; + const int height_index = k4x4HeightLog2[i] - 1; + // Only block sizes >= 8x8 are handled with this function. 
+ if (width_index < 0 || height_index < 0) continue; + + for (size_t j = 0; j < 2; ++j) { + EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr) + << ToString(static_cast<BlockSize>(i)) << " index [" << width_index + << "]" + << "[" << height_index << "][" << j << "]"; + } + } + + EXPECT_NE(dsp->average_blend, nullptr); + EXPECT_NE(dsp->distance_weighted_blend, nullptr); + for (int i = 0; i < kNumObmcDirections; ++i) { + EXPECT_NE(dsp->obmc_blend[i], nullptr) + << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]"; + } + EXPECT_NE(dsp->warp, nullptr); + EXPECT_NE(dsp->warp_compound, nullptr); + + for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) { + EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr) + << "index [" << i << "]"; + } + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumAutoRegressionLags; ++j) { + if (i == 0 && j == 0) { + EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr) + << " index [" << i << "]" + << "[" << j << "]"; + } else { + EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr) + << " index [" << i << "]" + << "[" << j << "]"; + } + } + EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr) + << "index [" << i << "]"; + EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr) + << "index [" << i << "]"; + } + EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr); + EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr); + EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr); + + EXPECT_NE(dsp->motion_field_projection_kernel, nullptr); + EXPECT_NE(dsp->mv_projection_compound[0], nullptr); + EXPECT_NE(dsp->mv_projection_compound[1], nullptr); + EXPECT_NE(dsp->mv_projection_compound[2], nullptr); + EXPECT_NE(dsp->mv_projection_single[0], nullptr); + EXPECT_NE(dsp->mv_projection_single[1], nullptr); + EXPECT_NE(dsp->mv_projection_single[2], nullptr); + } +} + +TEST(Dsp, TablesArePopulated) { + DspInit(); + CheckTables(/*c_only=*/false); +} + +#if 
LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +TEST(Dsp, TablesArePopulatedCOnly) { + test_utils::ResetDspTable(kBitdepth8); +#if LIBGAV1_MAX_BITDEPTH >= 10 + test_utils::ResetDspTable(kBitdepth10); +#endif + dsp_internal::DspInit_C(); + CheckTables(/*c_only=*/true); +} +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + +TEST(Dsp, GetDspTable) { + EXPECT_EQ(GetDspTable(1), nullptr); + EXPECT_NE(GetDspTable(8), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr); +#if LIBGAV1_MAX_BITDEPTH >= 10 + EXPECT_NE(GetDspTable(10), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr); +#else + EXPECT_EQ(GetDspTable(10), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr); +#endif +} + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h index fe93270..f75a354 100644 --- a/src/dsp/film_grain.h +++ b/src/dsp/film_grain.h @@ -25,6 +25,14 @@ // ARM: #include "src/dsp/arm/film_grain_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/film_grain_sse4.h" +// clang-format on + // IWYU pragma: end_exports namespace libgav1 { diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc new file mode 100644 index 0000000..90960c6 --- /dev/null +++ b/src/dsp/intra_edge_test.cc @@ -0,0 +1,504 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intra_edge.h" + +#include <cstdint> +#include <cstdio> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +const char kIntraEdge[] = "IntraEdge"; +const char kIntraEdgeFilterName[] = "Intra Edge Filter"; +const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler"; + +constexpr int kIntraEdgeFilterTestMaxSize = 129; +constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = { + 159, 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, + 233, 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, + 239, 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, + 164, 63, 171, 143, 210, 236, 253, 233, 139, 113, 66, 211, 133, 61, 91, + 123, 187, 76, 110, 172, 61, 103, 239, 147, 247, 120, 18, 106, 180, 159, + 208, 54, 136, 205, 124, 125, 165, 164, 63, 171, 143, 210, 236, 253, 233, + 139, 113, 66, 211, 133, 61, 91, 123, 187, 76, 110, 172, 61, 103, 239, + 147, 247, 120, 18, 106, 180, 159, 208, 54, 136, 205, 124, 125, 165, 164, + 63, 171, 143, 210, 236, 253, 233, 139, 113, +}; +constexpr int kIntraEdgeUpsamplerTestFixedInput[] = { + 208, 54, 136, 205, 124, 125, 165, 164, 63, + 171, 143, 210, 236, 208, 54, 136, 205}; + +struct 
EdgeFilterParams { + int size; + int strength; +}; + +std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) { + return os << "size: " << param.size << ", strength: " << param.strength; +} + +// Each size is paired with strength 1, 2, and 3. +// In general, the size is expressible as 2^n+1, but all sizes up to 129 are +// permissible. +constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = { + {1, 1}, {1, 2}, {1, 3}, {2, 1}, {2, 2}, {2, 3}, {5, 1}, {5, 2}, + {5, 3}, {9, 1}, {9, 2}, {9, 3}, {17, 1}, {17, 2}, {17, 3}, {33, 1}, + {33, 2}, {33, 3}, {50, 1}, {50, 2}, {50, 3}, {55, 1}, {55, 2}, {55, 3}, + {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}}; + +template <int bitdepth, typename Pixel> +class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> { + public: + IntraEdgeFilterTest() = default; + IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete; + IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete; + ~IntraEdgeFilterTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + IntraEdgeInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_intra_edge_filter_ = dsp->intra_edge_filter; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_intra_edge_filter_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraEdgeInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraEdgeInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_intra_edge_filter_ = dsp->intra_edge_filter; + } + + void TestFixedValues(const char* digest); + void TestRandomValues(int num_runs); + + Pixel buffer_[kIntraEdgeFilterTestMaxSize]; 
+ Pixel base_buffer_[kIntraEdgeFilterTestMaxSize]; + int strength_ = GetParam().strength; + int size_ = GetParam().size; + + IntraEdgeFilterFunc base_intra_edge_filter_; + IntraEdgeFilterFunc cur_intra_edge_filter_; +}; + +template <int bitdepth, typename Pixel> +void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues( + const char* const digest) { + if (cur_intra_edge_filter_ == nullptr) return; + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + buffer_[i] = kIntraEdgeFilterTestFixedInput[i]; + } + const absl::Time start = absl::Now(); + cur_intra_edge_filter_(buffer_, size_, strength_); + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_, + kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (base_intra_edge_filter_ == nullptr) return; + if (cur_intra_edge_filter_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration elapsed_time; + absl::Duration base_elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + const Pixel val = rnd(bitdepth); + buffer_[i] = val; + base_buffer_[i] = val; + } + const absl::Time base_start = absl::Now(); + base_intra_edge_filter_(base_buffer_, size_, strength_); + base_elapsed_time += absl::Now() - base_start; + const absl::Time start = absl::Now(); + cur_intra_edge_filter_(buffer_, size_, strength_); + elapsed_time += absl::Now() - start; + } + if (num_runs > 1) { + printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n", + kIntraEdge, kIntraEdgeFilterName, size_, strength_, + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)), + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + 
absl::ToDoubleMicroseconds(base_elapsed_time) / + absl::ToDoubleMicroseconds(elapsed_time)); + } else { + printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge, + kIntraEdgeFilterName, size_, strength_); + } + for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) { + EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i; + } +} + +using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>; + +const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) { + static const char* const kDigestsSize1[3] = { + "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1", + "f7f681cf7047602fafc7fb416ecf46e1"}; + static const char* const kDigestsSize2[3] = { + "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7", + "a7eb3dba95ff35c2df45a274afbc9772"}; + static const char* const kDigestsSize5[3] = { + "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22", + "d313523d3b7646fdbb873c61ffe7a51a"}; + static const char* const kDigestsSize9[3] = { + "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272", + "bdf4f16734c86338716fb436c196ecc6"}; + static const char* const kDigestsSize17[3] = { + "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd", + "8f68603598638fa33203fe1233d273b1"}; + static const char* const kDigestsSize33[3] = { + "51156da8f4d527e0c011040769987dbd", "eff17eaf73a7bb7fd4c921510ade9f67", + "aca87680e0649d0728091c92c6de8871"}; + static const char* const kDigestsSize50[3] = { + "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2", + "16a9148daf0e5f69808b9f0caa1ef110"}; + static const char* const kDigestsSize55[3] = { + "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a", + "0318b2fde088c472215fe155f3b48d36"}; + static const char* const kDigestsSize65[3] = { + "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3", + "d7c71db339c28d33119974987b2f9d85"}; + static const char* const kDigestsSize129[3] = { + 
"bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b", + "7306715602b0f5536771724a2f0a39bc"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 17: + return kDigestsSize17[strength - 1]; + case 33: + return kDigestsSize33[strength - 1]; + case 50: + return kDigestsSize50[strength - 1]; + case 55: + return kDigestsSize55[strength - 1]; + case 65: + return kDigestsSize65[strength - 1]; + case 129: + return kDigestsSize129[strength - 1]; + default: + ADD_FAILURE() << "Unknown edge size: " << size; + return nullptr; + } +} + +TEST_P(IntraEdgeFilterTest8bpp, Correctness) { + TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); } + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>; + +const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) { + static const char* const kDigestsSize1[3] = { + "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0", + "2d2088560e3ccb5b809c97f5299bb1c0"}; + static const char* const kDigestsSize2[3] = { + "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181", + "bcdd1b21f3baf5f6f29caea9ef93fb0c"}; + static const char* const kDigestsSize5[3] = { + "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2", + "48d516b06313683aca30e975ce6a3cad"}; + static const char* const kDigestsSize9[3] = { + "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2", + "bb61aa9c5fa720c667a053769e7b7d08"}; + static const char* const kDigestsSize17[3] = { + "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d", + "37d44d10a2545385af1da55f8c08564f"}; + static const char* const kDigestsSize33[3] = { + "c95108e06eb2aef61ecb6839af306edd", 
"832c695460b4dd2b85c5f8726e4470d1", + "994902f549eefd83fbcbf7ecb7dc5cca"}; + static const char* const kDigestsSize50[3] = { + "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89", + "84e40aadcc416ef3f51cea3cc23b30c7"}; + static const char* const kDigestsSize55[3] = { + "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d", + "7c320d55b11f93185b811bdaa379f2db"}; + static const char* const kDigestsSize65[3] = { + "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f", + "d06e6b88585f7f1a1f6af5bb59ee2180"}; + static const char* const kDigestsSize129[3] = { + "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738", + "500786d8561effec283d4f3d13886f8c"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 17: + return kDigestsSize17[strength - 1]; + case 33: + return kDigestsSize33[strength - 1]; + case 50: + return kDigestsSize50[strength - 1]; + case 55: + return kDigestsSize55[strength - 1]; + case 65: + return kDigestsSize65[strength - 1]; + case 129: + return kDigestsSize129[strength - 1]; + default: + ADD_FAILURE() << "Unknown edge size: " << size; + return nullptr; + } +} + +TEST_P(IntraEdgeFilterTest10bpp, FixedInput) { + TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); } +#endif + +template <int bitdepth, typename Pixel> +class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> { + public: + IntraEdgeUpsamplerTest() = default; + IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete; + IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete; + ~IntraEdgeUpsamplerTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + 
IntraEdgeInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_intra_edge_upsampler_ = dsp->intra_edge_upsampler; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_intra_edge_upsampler_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraEdgeInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraEdgeInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler; + } + + void TestFixedValues(const char* digest); + void TestRandomValues(int num_runs); + + Pixel buffer_[128]; + Pixel base_buffer_[128]; + int size_ = GetParam(); + + IntraEdgeUpsamplerFunc base_intra_edge_upsampler_; + IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_; +}; + +template <int bitdepth, typename Pixel> +void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues( + const char* const digest) { + if (cur_intra_edge_upsampler_ == nullptr) return; + buffer_[0] = 0; + for (int i = 0; i < size_ + 1; ++i) { + buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i]; + } + const absl::Time start = absl::Now(); + cur_intra_edge_upsampler_(buffer_ + 2, size_); + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest, + buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { + if (base_intra_edge_upsampler_ == nullptr) return; + if (cur_intra_edge_upsampler_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + absl::Duration base_elapsed_time; + absl::Duration 
elapsed_time; + for (int num_tests = 0; num_tests < num_runs; ++num_tests) { + // Populate what will be buffer[-2..size] when passed to the upsample + // function. + buffer_[0] = 0; + base_buffer_[0] = 0; + for (int i = 1; i < size_ + 2; ++i) { + const Pixel val = rnd(bitdepth); + buffer_[i] = val; + base_buffer_[i] = val; + } + const absl::Time base_start = absl::Now(); + base_intra_edge_upsampler_(base_buffer_ + 2, size_); + base_elapsed_time += absl::Now() - base_start; + const absl::Time start = absl::Now(); + cur_intra_edge_upsampler_(buffer_ + 2, size_); + elapsed_time += absl::Now() - start; + } + if (num_runs > 1) { + printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge, + kIntraEdgeUpsamplerName, size_, + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)), + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + absl::ToDoubleMicroseconds(base_elapsed_time) / + absl::ToDoubleMicroseconds(elapsed_time)); + } else { + printf("Mode %s[%31s]: size %d \n", kIntraEdge, kIntraEdgeUpsamplerName, + size_); + } + + for (int i = 0; i < size_ * 2 + 1; ++i) { + EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i; + } +} + +using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>; + +constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16}; + +const char* GetIntraEdgeUpsampleDigest8bpp(int size) { + switch (size) { + case 4: + return "aa9002e03f8d15eb26bbee76f40bb923"; + case 8: + return "cacfca86d65eff0d951eb21fc15f242a"; + case 12: + return "0529e00a1fa80bc866fa7662ad2d7b9f"; + case 16: + return "03e3b3e0ea438ea48ef05651c0a54986"; + default: + ADD_FAILURE() << "Unknown upsample size: " << size; + return ""; + } +} + +TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) { + TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); } + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraEdgeUpsamplerTest10bpp = 
IntraEdgeUpsamplerTest<10, uint16_t>; + +const char* GetIntraEdgeUpsampleDigest10bpp(int size) { + switch (size) { + case 4: + return "341c6bb705a02bba65b34f92d8ca83cf"; + case 8: + return "fdbe4b3b341921dcb0edf00dfc4d7667"; + case 12: + return "ad69a491287495ec9973d4006d5ac461"; + case 16: + return "04acf32e517d80ce4c4958e711b9b890"; + default: + ADD_FAILURE() << "Unknown upsample size: " << size; + return ""; + } +} + +TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) { + TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); } +#endif + +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +#endif +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif + +#endif +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git 
a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc index 4bcb580..4520c2c 100644 --- a/src/dsp/intrapred.cc +++ b/src/dsp/intrapred.cc @@ -19,21 +19,18 @@ #include <cstddef> #include <cstdint> #include <cstdlib> -#include <cstring> // memset +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" namespace libgav1 { namespace dsp { namespace { -constexpr TransformSize kTransformSizesLargerThan32x32[] = { - kTransformSize16x64, kTransformSize32x64, kTransformSize64x16, - kTransformSize64x32, kTransformSize64x64}; - template <int block_width, int block_height, typename Pixel> struct IntraPredFuncs_C { IntraPredFuncs_C() = delete; @@ -50,12 +47,6 @@ struct IntraPredFuncs_C { const void* left_column); static void Paeth(void* dest, ptrdiff_t stride, const void* top_row, const void* left_column); - static void Smooth(void* dest, ptrdiff_t stride, const void* top_row, - const void* left_column); - static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row, - const void* left_column); - static void SmoothHorizontal(void* dest, ptrdiff_t stride, - const void* top_row, const void* left_column); }; // Intra-predictors that require bitdepth. 
@@ -190,16 +181,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal( } } -template <typename Pixel> -inline Pixel Average(Pixel a, Pixel b) { - return static_cast<Pixel>((a + b + 1) >> 1); -} - -template <typename Pixel> -inline Pixel Average(Pixel a, Pixel b, Pixel c) { - return static_cast<Pixel>((a + 2 * b + c + 2) >> 2); -} - // IntraPredFuncs_C::Paeth template <int block_width, int block_height, typename Pixel> void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth( @@ -238,110 +219,6 @@ void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth( } } -constexpr uint8_t kSmoothWeights[] = { - // block dimension = 4 - 255, 149, 85, 64, - // block dimension = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // block dimension = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // block dimension = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // block dimension = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, - 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, - 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; - -// IntraPredFuncs_C::Smooth -template <int block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel top_right = top[block_width - 1]; - const Pixel bottom_left = left[block_height - 1]; - static_assert( - block_width >= 4 && block_height >= 4, - "Weights for smooth predictor undefined for block width/height < 4"); - const uint8_t* const weights_x = kSmoothWeights + block_width - 
4; - const uint8_t* const weights_y = kSmoothWeights + block_height - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]); - uint32_t pred = weights_y[y] * top[x]; - pred += weights_x[x] * left[y]; - pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; - pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; - // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1) - // + 256. With the descale there's no need for saturation. - dst[x] = static_cast<Pixel>( - RightShiftWithRounding(pred, kSmoothWeightScale + 1)); - } - dst += stride; - } -} - -// IntraPredFuncs_C::SmoothVertical -template <int block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel bottom_left = left[block_height - 1]; - static_assert(block_height >= 4, - "Weights for smooth predictor undefined for block height < 4"); - const uint8_t* const weights_y = kSmoothWeights + block_height - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_y[y]); - uint32_t pred = weights_y[y] * top[x]; - pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; - dst[x] = - static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); - } - dst += stride; - } -} - -// IntraPredFuncs_C::SmoothHorizontal -template <int 
block_width, int block_height, typename Pixel> -void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal( - void* const dest, ptrdiff_t stride, const void* const top_row, - const void* const left_column) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - const Pixel top_right = top[block_width - 1]; - static_assert(block_width >= 4, - "Weights for smooth predictor undefined for block width < 4"); - const uint8_t* const weights_x = kSmoothWeights + block_width - 4; - const uint16_t scale_value = (1 << kSmoothWeightScale); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(scale_value >= weights_x[x]); - uint32_t pred = weights_x[x] * left[y]; - pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; - dst[x] = - static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); - } - dst += stride; - } -} - //------------------------------------------------------------------------------ // IntraPredBppFuncs_C template <int fill, typename Pixel> @@ -366,288 +243,7 @@ void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill( block_height); } -//------------------------------------------------------------------------------ -// FilterIntraPredictor_C - -template <int bitdepth, typename Pixel> -void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const FilterIntraPredictor pred, const int width, - const int height) { - const int kMaxPixel = (1 << bitdepth) - 1; - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - - assert(width <= 32 && height <= 32); - - Pixel buffer[3][33]; // cache 2 rows + top & left boundaries - memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0])); - 
- auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - int row0 = 0, row2 = 2; - int ystep = 1; - int y = 0; - do { - buffer[1][0] = left[y]; - buffer[row2][0] = left[y + 1]; - int x = 1; - do { - const Pixel p0 = buffer[row0][x - 1]; // top-left - const Pixel p1 = buffer[row0][x + 0]; // top 0 - const Pixel p2 = buffer[row0][x + 1]; // top 1 - const Pixel p3 = buffer[row0][x + 2]; // top 2 - const Pixel p4 = buffer[row0][x + 3]; // top 3 - const Pixel p5 = buffer[1][x - 1]; // left 0 - const Pixel p6 = buffer[row2][x - 1]; // left 1 - for (int i = 0; i < 8; ++i) { - const int xoffset = i & 0x03; - const int yoffset = (i >> 2) * ystep; - const int value = kFilterIntraTaps[pred][i][0] * p0 + - kFilterIntraTaps[pred][i][1] * p1 + - kFilterIntraTaps[pred][i][2] * p2 + - kFilterIntraTaps[pred][i][3] * p3 + - kFilterIntraTaps[pred][i][4] * p4 + - kFilterIntraTaps[pred][i][5] * p5 + - kFilterIntraTaps[pred][i][6] * p6; - buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>( - Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel)); - } - x += 4; - } while (x < width); - memcpy(dst, &buffer[1][1], width * sizeof(dst[0])); - dst += stride; - memcpy(dst, &buffer[row2][1], width * sizeof(dst[0])); - dst += stride; - - // The final row becomes the top for the next pass. - row0 ^= 2; - row2 ^= 2; - ystep = -ystep; - y += 2; - } while (y < height); -} - -//------------------------------------------------------------------------------ -// CflIntraPredictor_C - -// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. -// |alpha| can be -16 to 16 (inclusive). 
-template <int block_width, int block_height, int bitdepth, typename Pixel> -void CflIntraPredictor_C( - void* const dest, ptrdiff_t stride, - const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int alpha) { - auto* dst = static_cast<Pixel*>(dest); - const int dc = dst[0]; - stride /= sizeof(Pixel); - const int max_value = (1 << bitdepth) - 1; - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3)); - assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3); - dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6), - 0, max_value); - } - dst += stride; - } -} - -//------------------------------------------------------------------------------ -// CflSubsampler_C - -template <int block_width, int block_height, int bitdepth, typename Pixel, - int subsampling_x, int subsampling_y> -void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int max_luma_width, const int max_luma_height, - const void* const source, ptrdiff_t stride) { - assert(max_luma_width >= 4); - assert(max_luma_height >= 4); - const auto* src = static_cast<const Pixel*>(source); - stride /= sizeof(Pixel); - int sum = 0; - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - const ptrdiff_t luma_x = - std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x)); - const ptrdiff_t luma_x_next = luma_x + stride; - luma[y][x] = - (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) + - ((subsampling_y != 0) ? 
(src[luma_x_next] + src[luma_x_next + 1]) - : 0)) - << (3 - subsampling_x - subsampling_y); - sum += luma[y][x]; - } - if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) { - src += stride << subsampling_y; - } - } - const int average = RightShiftWithRounding( - sum, FloorLog2(block_width) + FloorLog2(block_height)); - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - luma[y][x] -= average; - } - } -} - -//------------------------------------------------------------------------------ -// 7.11.2.4. Directional intra prediction process - -template <typename Pixel> -void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const auto* const top = static_cast<const Pixel*>(top_row); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - assert(xstep > 0); - - // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to - // |top[top_base_x]|. This corresponds to a 45 degree prediction. - if (xstep == 64) { - // 7.11.2.10. Intra edge upsample selection process - // if ( d <= 0 || d >= 40 ) useUpsample = 0 - // For |upsampled_top| the delta is |predictor_angle - 90|. Since the - // |predictor_angle| is 45 the delta is also 45. 
- assert(!upsampled_top); - const Pixel* top_ptr = top + 1; - for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) { - memcpy(dst, top_ptr, sizeof(*top_ptr) * width); - } - return; - } - - const int upsample_shift = static_cast<int>(upsampled_top); - const int max_base_x = ((width + height) - 1) << upsample_shift; - const int scale_bits = 6 - upsample_shift; - const int base_step = 1 << upsample_shift; - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - - if (top_base_x >= max_base_x) { - for (int i = y; i < height; ++i) { - Memset(dst, top[max_base_x], width); - dst += stride; - } - return; - } - - const int shift = ((top_x << upsample_shift) & 0x3F) >> 1; - int x = 0; - do { - if (top_base_x >= max_base_x) { - Memset(dst + x, top[max_base_x], width - x); - break; - } - - const int val = - top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; - dst[x] = RightShiftWithRounding(val, 5); - top_base_x += base_step; - } while (++x < width); - - dst += stride; - top_x += xstep; - } while (++y < height); -} - -template <typename Pixel> -void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const int width, const int height, - const int xstep, const int ystep, - const bool upsampled_top, - const bool upsampled_left) { - const auto* const top = static_cast<const Pixel*>(top_row); - const auto* const left = static_cast<const Pixel*>(left_column); - auto* dst = static_cast<Pixel*>(dest); - stride /= sizeof(Pixel); - - assert(xstep > 0); - assert(ystep > 0); - - const int upsample_top_shift = static_cast<int>(upsampled_top); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int scale_bits_x = 6 - upsample_top_shift; - const int scale_bits_y = 6 - upsample_left_shift; - const int min_base_x = -(1 << upsample_top_shift); - const int base_step_x = 1 << upsample_top_shift; - int y = 0; - int top_x = -xstep; - do { - int 
top_base_x = top_x >> scale_bits_x; - int left_y = (y << 6) - ystep; - int x = 0; - do { - int val; - if (top_base_x >= min_base_x) { - const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1; - val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; - } else { - // Note this assumes an arithmetic shift to handle negative values. - const int left_base_y = left_y >> scale_bits_y; - const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1; - assert(left_base_y >= -(1 << upsample_left_shift)); - val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; - } - dst[x] = RightShiftWithRounding(val, 5); - top_base_x += base_step_x; - left_y -= ystep; - } while (++x < width); - - top_x -= xstep; - dst += stride; - } while (++y < height); -} - -template <typename Pixel> -void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled_left) { - const auto* const left = static_cast<const Pixel*>(left_column); - stride /= sizeof(Pixel); - - assert(ystep > 0); - - const int upsample_shift = static_cast<int>(upsampled_left); - const int scale_bits = 6 - upsample_shift; - const int base_step = 1 << upsample_shift; - // Zone3 never runs out of left_column values. 
- assert((width + height - 1) << upsample_shift > // max_base_y - ((ystep * width) >> scale_bits) + - base_step * (height - 1)); // left_base_y - - int left_y = ystep; - int x = 0; - do { - auto* dst = static_cast<Pixel*>(dest); - - int left_base_y = left_y >> scale_bits; - int y = 0; - do { - const int shift = ((left_y << upsample_shift) & 0x3F) >> 1; - const int val = - left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; - dst[x] = RightShiftWithRounding(val, 5); - dst += stride; - left_base_y += base_step; - } while (++y < height); - - left_y += ystep; - } while (++x < width); -} - -//------------------------------------------------------------------------------ +// ----------------------------------------------------------------------------- template <typename Pixel> struct IntraPredDefs { @@ -718,15 +314,7 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>; dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \ DEFS::_##W##x##H::Horizontal; \ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \ - DEFS::_##W##x##H::Paeth; \ - dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \ - DEFS::_##W##x##H::Smooth; \ - dsp->intra_predictors[kTransformSize##W##x##H] \ - [kIntraPredictorSmoothVertical] = \ - DEFS::_##W##x##H::SmoothVertical; \ - dsp->intra_predictors[kTransformSize##W##x##H] \ - [kIntraPredictorSmoothHorizontal] = \ - DEFS::_##W##x##H::SmoothHorizontal + DEFS::_##W##x##H::Paeth #define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \ @@ -749,45 +337,11 @@ using Defs8bpp = IntraPredBppDefs<8, uint8_t>; INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \ INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64) -#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \ - dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \ - CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \ - 
CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \ - CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \ - dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \ - CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1> - -#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \ - INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL) - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS INIT_INTRAPREDICTORS(Defs, Defs8bpp); - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint8_t>; - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint8_t>; - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint8_t>; - dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; - INIT_CFL_INTRAPREDICTORS(8, uint8_t); #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = @@ -816,19 +370,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = Defs::_4x4::Paeth; #endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = - Defs::_4x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = - Defs::_4x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = - Defs::_4x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = Defs8bpp::_4x8::DcFill; @@ -856,19 +397,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = Defs::_4x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = - Defs::_4x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = - Defs::_4x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = - Defs::_4x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = Defs8bpp::_4x16::DcFill; @@ -897,19 +425,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = Defs::_4x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = - Defs::_4x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = - Defs::_4x16::SmoothVertical; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = - Defs::_4x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = Defs8bpp::_8x4::DcFill; @@ -937,19 +452,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = Defs::_8x4::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = - Defs::_8x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = - Defs::_8x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = - Defs::_8x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = Defs8bpp::_8x8::DcFill; @@ -977,19 +479,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = Defs::_8x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = - Defs::_8x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = - Defs::_8x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = - Defs::_8x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = Defs8bpp::_8x16::DcFill; @@ -1018,19 
+507,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = Defs::_8x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = - Defs::_8x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = - Defs::_8x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = - Defs::_8x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = Defs8bpp::_8x32::DcFill; @@ -1059,19 +535,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = Defs::_8x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = - Defs::_8x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = - Defs::_8x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = - Defs::_8x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = Defs8bpp::_16x4::DcFill; @@ -1100,19 +563,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = Defs::_16x4::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = - Defs::_16x4::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = - Defs::_16x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = - Defs::_16x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = Defs8bpp::_16x8::DcFill; @@ -1141,19 +591,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = Defs::_16x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = - Defs::_16x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = - Defs::_16x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = - Defs::_16x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = Defs8bpp::_16x16::DcFill; @@ -1182,19 +619,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = Defs::_16x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = - Defs::_16x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = - Defs::_16x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = - Defs::_16x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = Defs8bpp::_16x32::DcFill; @@ -1223,19 +647,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = Defs::_16x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = - Defs::_16x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = - Defs::_16x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = - Defs::_16x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = Defs8bpp::_16x64::DcFill; @@ -1264,19 +675,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = Defs::_16x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = - Defs::_16x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = - Defs::_16x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = - Defs::_16x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = Defs8bpp::_32x8::DcFill; @@ -1305,19 +703,6 @@ 
void Init8bpp() { dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = Defs::_32x8::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = - Defs::_32x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = - Defs::_32x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = - Defs::_32x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = Defs8bpp::_32x16::DcFill; @@ -1346,19 +731,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = Defs::_32x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = - Defs::_32x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = - Defs::_32x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = - Defs::_32x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = Defs8bpp::_32x32::DcFill; @@ -1387,19 +759,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = Defs::_32x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = - Defs::_32x32::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = - Defs::_32x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = - Defs::_32x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = Defs8bpp::_32x64::DcFill; @@ -1428,19 +787,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = Defs::_32x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = - Defs::_32x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = - Defs::_32x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = - Defs::_32x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = Defs8bpp::_64x16::DcFill; @@ -1469,19 +815,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = Defs::_64x16::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = - Defs::_64x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = - Defs::_64x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = - Defs::_64x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = Defs8bpp::_64x32::DcFill; @@ -1510,19 +843,6 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = Defs::_64x32::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = - Defs::_64x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = - Defs::_64x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = - Defs::_64x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = Defs8bpp::_64x64::DcFill; @@ -1551,282 +871,7 @@ void Init8bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = Defs::_64x64::Paeth; #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = - Defs::_64x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = - Defs::_64x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = - Defs::_64x64::SmoothHorizontal; -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint8_t>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint8_t>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor - dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x4] = - CflIntraPredictor_C<4, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = - CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = - CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = - CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x8] = - CflIntraPredictor_C<4, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = - CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = - CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = - CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x16] = - CflIntraPredictor_C<4, 16, 8, uint8_t>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = - CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = - CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = - CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x4] = - CflIntraPredictor_C<8, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = - CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = - CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = - CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x8] = - CflIntraPredictor_C<8, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = - CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = - CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = - CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor - 
dsp->cfl_intra_predictors[kTransformSize8x16] = - CflIntraPredictor_C<8, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = - CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = - CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = - CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x32] = - CflIntraPredictor_C<8, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = - CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = - CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = - CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x4] = - CflIntraPredictor_C<16, 4, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = - CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = - CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = - CflSubsampler_C<16, 4, 
8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x8] = - CflIntraPredictor_C<16, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = - CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = - CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = - CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x16] = - CflIntraPredictor_C<16, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = - CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = - CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = - CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x32] = - CflIntraPredictor_C<16, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = - CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = - CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = - CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x8] = - CflIntraPredictor_C<32, 8, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = - CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = - CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = - CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x16] = - CflIntraPredictor_C<32, 16, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = - CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = - CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = - CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x32] = - CflIntraPredictor_C<32, 32, 8, uint8_t>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = - CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 - 
dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = - CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = - CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; -#endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - // Cfl predictors are available only for transform sizes with max(width, - // height) <= 32. Set all others to nullptr. - for (const auto i : kTransformSizesLargerThan32x32) { - dsp->cfl_intra_predictors[i] = nullptr; - for (int j = 0; j < kNumSubsamplingTypes; ++j) { - dsp->cfl_subsamplers[i][j] = nullptr; - } - } } // NOLINT(readability/fn_size) #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -1838,14 +883,6 @@ void Init10bpp() { assert(dsp != nullptr); #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp); - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint16_t>; - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_C<uint16_t>; - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint16_t>; - dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; - INIT_CFL_INTRAPREDICTORS(10, uint16_t); #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS #ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = @@ -1875,19 +912,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = DefsHbd::_4x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = - DefsHbd::_4x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = - DefsHbd::_4x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = Defs10bpp::_4x8::DcFill; @@ -1916,19 +940,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = DefsHbd::_4x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = - DefsHbd::_4x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = - DefsHbd::_4x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = Defs10bpp::_4x16::DcFill; @@ -1957,19 +968,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = DefsHbd::_4x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = - DefsHbd::_4x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = - DefsHbd::_4x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_4x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = Defs10bpp::_8x4::DcFill; @@ -1998,19 +996,6 @@ void 
Init10bpp() { dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = DefsHbd::_8x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = - DefsHbd::_8x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = - DefsHbd::_8x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = Defs10bpp::_8x8::DcFill; @@ -2039,19 +1024,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = DefsHbd::_8x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = - DefsHbd::_8x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = - DefsHbd::_8x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = Defs10bpp::_8x16::DcFill; @@ -2080,19 +1052,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = DefsHbd::_8x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = - DefsHbd::_8x16::Smooth; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = - DefsHbd::_8x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = Defs10bpp::_8x32::DcFill; @@ -2121,19 +1080,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = DefsHbd::_8x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = - DefsHbd::_8x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = - DefsHbd::_8x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_8x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = Defs10bpp::_16x4::DcFill; @@ -2162,19 +1108,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = DefsHbd::_16x4::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = - DefsHbd::_16x4::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = - DefsHbd::_16x4::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal - 
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x4::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = Defs10bpp::_16x8::DcFill; @@ -2203,19 +1136,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = DefsHbd::_16x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = - DefsHbd::_16x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = - DefsHbd::_16x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = Defs10bpp::_16x16::DcFill; @@ -2244,19 +1164,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = DefsHbd::_16x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = - DefsHbd::_16x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = - DefsHbd::_16x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = 
Defs10bpp::_16x32::DcFill; @@ -2285,19 +1192,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = DefsHbd::_16x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = - DefsHbd::_16x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = - DefsHbd::_16x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = Defs10bpp::_16x64::DcFill; @@ -2326,19 +1220,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = DefsHbd::_16x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = - DefsHbd::_16x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = - DefsHbd::_16x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_16x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = Defs10bpp::_32x8::DcFill; @@ -2367,19 +1248,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = DefsHbd::_32x8::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth - 
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = - DefsHbd::_32x8::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = - DefsHbd::_32x8::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x8::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = Defs10bpp::_32x16::DcFill; @@ -2408,19 +1276,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = DefsHbd::_32x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = - DefsHbd::_32x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = - DefsHbd::_32x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x16::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = Defs10bpp::_32x32::DcFill; @@ -2449,19 +1304,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = DefsHbd::_32x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = - DefsHbd::_32x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = - 
DefsHbd::_32x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = Defs10bpp::_32x64::DcFill; @@ -2490,19 +1332,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = DefsHbd::_32x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = - DefsHbd::_32x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = - DefsHbd::_32x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_32x64::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = Defs10bpp::_64x16::DcFill; @@ -2531,19 +1360,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = DefsHbd::_64x16::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = - DefsHbd::_64x16::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = - DefsHbd::_64x16::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x16::SmoothHorizontal; -#endif - #ifndef 
LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = Defs10bpp::_64x32::DcFill; @@ -2572,19 +1388,6 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = DefsHbd::_64x32::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = - DefsHbd::_64x32::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = - DefsHbd::_64x32::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x32::SmoothHorizontal; -#endif - #ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = Defs10bpp::_64x64::DcFill; @@ -2613,291 +1416,12 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = DefsHbd::_64x64::Paeth; #endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = - DefsHbd::_64x64::Smooth; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = - DefsHbd::_64x64::SmoothVertical; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal - dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = - DefsHbd::_64x64::SmoothHorizontal; -#endif - -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_C<uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 - dsp->directional_intra_predictor_zone2 = - 
DirectionalIntraPredictorZone2_C<uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_C<uint16_t>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor - dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x4] = - CflIntraPredictor_C<4, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = - CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = - CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = - CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x8] = - CflIntraPredictor_C<4, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = - CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = - CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = - CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize4x16] = - CflIntraPredictor_C<4, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 - 
dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = - CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = - CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = - CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x4] = - CflIntraPredictor_C<8, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = - CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = - CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = - CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x8] = - CflIntraPredictor_C<8, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = - CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = - CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = - CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x16] = - 
CflIntraPredictor_C<8, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = - CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = - CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = - CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize8x32] = - CflIntraPredictor_C<8, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = - CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = - CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = - CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x4] = - CflIntraPredictor_C<16, 4, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = - CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = - CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = - CflSubsampler_C<16, 4, 10, uint16_t, 1, 
1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x8] = - CflIntraPredictor_C<16, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = - CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = - CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = - CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x16] = - CflIntraPredictor_C<16, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = - CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = - CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = - CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize16x32] = - CflIntraPredictor_C<16, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = - CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = - CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = - CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x8] = - CflIntraPredictor_C<32, 8, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = - CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = - CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = - CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x16] = - CflIntraPredictor_C<32, 16, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = - CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = - CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = - CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; -#endif - -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor - dsp->cfl_intra_predictors[kTransformSize32x32] = - CflIntraPredictor_C<32, 32, 10, uint16_t>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = - CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; -#endif -#ifndef 
LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = - CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>; -#endif -#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 - dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = - CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; -#endif - #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - // Cfl predictors are available only for transform sizes with max(width, - // height) <= 32. Set all others to nullptr. - for (const auto i : kTransformSizesLargerThan32x32) { - dsp->cfl_intra_predictors[i] = nullptr; - for (int j = 0; j < kNumSubsamplingTypes; ++j) { - dsp->cfl_subsamplers[i][j] = nullptr; - } - } } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 -#undef INIT_CFL_INTRAPREDICTOR_WxH -#undef INIT_CFL_INTRAPREDICTORS #undef INIT_INTRAPREDICTORS_WxH #undef INIT_INTRAPREDICTORS - } // namespace void IntraPredInit_C() { diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h index c5286ef..2cb625d 100644 --- a/src/dsp/intrapred.h +++ b/src/dsp/intrapred.h @@ -38,9 +38,7 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor. This function is not thread-safe. +// Initializes Dsp::intra_predictors. This function is not thread-safe. void IntraPredInit_C(); } // namespace dsp diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc new file mode 100644 index 0000000..948c0c0 --- /dev/null +++ b/src/dsp/intrapred_cfl.cc @@ -0,0 +1,654 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr TransformSize kTransformSizesLargerThan32x32[] = { + kTransformSize16x64, kTransformSize32x64, kTransformSize64x16, + kTransformSize64x32, kTransformSize64x64}; + +//------------------------------------------------------------------------------ +// CflIntraPredictor_C + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. +// |alpha| can be -16 to 16 (inclusive). 
+template <int block_width, int block_height, int bitdepth, typename Pixel> +void CflIntraPredictor_C( + void* const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<Pixel*>(dest); + const int dc = dst[0]; + stride /= sizeof(Pixel); + const int max_value = (1 << bitdepth) - 1; + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3)); + assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3); + dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6), + 0, max_value); + } + dst += stride; + } +} + +//------------------------------------------------------------------------------ +// CflSubsampler_C + +template <int block_width, int block_height, int bitdepth, typename Pixel, + int subsampling_x, int subsampling_y> +void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const auto* src = static_cast<const Pixel*>(source); + stride /= sizeof(Pixel); + int sum = 0; + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + const ptrdiff_t luma_x = + std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x)); + const ptrdiff_t luma_x_next = luma_x + stride; + luma[y][x] = + (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) + + ((subsampling_y != 0) ? 
(src[luma_x_next] + src[luma_x_next + 1]) + : 0)) + << (3 - subsampling_x - subsampling_y); + sum += luma[y][x]; + } + if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) { + src += stride << subsampling_y; + } + } + const int average = RightShiftWithRounding( + sum, FloorLog2(block_width) + FloorLog2(block_height)); + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + luma[y][x] -= average; + } + } +} + +//------------------------------------------------------------------------------ + +// Initializes dsp entries for kTransformSize|W|x|H|. +#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \ + dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \ + CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \ + dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \ + CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1> + +#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \ + INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL) + +void Init8bpp() 
{ + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(8, uint8_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + 
CflIntraPredictor_C<16, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; +#endif 
+ +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(10, uint16_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 + 
dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor + 
dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; +#endif + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_CFL_INTRAPREDICTOR_WxH +#undef INIT_CFL_INTRAPREDICTORS + +} // namespace + +void IntraPredCflInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_cfl.h b/src/dsp/intrapred_cfl.h new file mode 100644 index 0000000..4e8a11f --- /dev/null +++ b/src/dsp/intrapred_cfl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_cfl_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_cfl_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers. +// This function is not thread-safe. +void IntraPredCflInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc new file mode 100644 index 0000000..e700a5b --- /dev/null +++ b/src/dsp/intrapred_cfl_test.cc @@ -0,0 +1,923 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kCflIntraPredName = "kCflIntraPredictor"; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << 
bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// CflIntraPredTest + +template <int bitdepth, typename Pixel> +class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + CflIntraPredTest() = default; + CflIntraPredTest(const CflIntraPredTest&) = delete; + CflIntraPredTest& operator=(const CflIntraPredTest&) = delete; + ~CflIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredCflInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_]; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_cfl_intra_pred_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredCflInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredCflInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_cfl_intra_pred_ = 
dsp->cfl_intra_predictors[tx_size_]; + + if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) { + cur_cfl_intra_pred_ = nullptr; + } + } + + // This test modifies intra_pred_mem_. + void TestSpeed(const char* digest, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + CflIntraPredictorFunc base_cfl_intra_pred_; + CflIntraPredictorFunc cur_cfl_intra_pred_; +}; + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest, + const int num_runs) { + if (cur_cfl_intra_pred_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + const int alpha = rnd(33) - 16; + const int dc = rnd(1 << bitdepth); + const int max_luma = ((1 << bitdepth) - 1) << 3; + for (int i = 0; i < block_height_; ++i) { + for (int j = 0; j < block_width_; ++j) { + if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) { + luma[i][j] = max_luma - rnd(max_luma << 1); + } + } + } + for (auto& r : intra_pred_mem_.ref_src) r = dc; + + absl::Duration elapsed_time; + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha); + elapsed_time += absl::Now() - start; + } + test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest, + intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst), + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + // Skip the 'C' test case as this is used as the reference. 
+ if (base_cfl_intra_pred_ == nullptr) return; + + int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride]; + for (auto& line : luma_buffer) { + for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3; + } + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + static constexpr int kSaturatedAlpha[] = {-16, 16}; + for (const int alpha : kSaturatedAlpha) { + for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of CFL with alpha " + << alpha << " differs from reference."; + break; + } + } +} + +template <int bitdepth, typename Pixel> +void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Skip the 'C' test case as this is used as the reference. + if (base_cfl_intra_pred_ == nullptr) return; + int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride]; + + const int max_luma = ((1 << bitdepth) - 1) << 3; + // Use an alternate seed to differentiate this test from TestSpeed(). 
+ libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (auto& line : luma_buffer) { + for (auto& luma : line) luma = max_luma - rnd(max_luma << 1); + } + const int dc = rnd(1 << bitdepth); + for (auto& r : intra_pred_mem_.ref_src) r = dc; + static constexpr int kSaturatedAlpha[] = {-16, 16}; + for (const int alpha : kSaturatedAlpha) { + intra_pred_mem_.Reset(&rnd); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha); + cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of CFL with alpha " + << alpha << " differs from reference."; + break; + } + } +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + CflSubsamplerTest() = default; + CflSubsamplerTest(const CflSubsamplerTest&) = delete; + CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete; + ~CflSubsamplerTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredCflInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type]; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if 
(absl::StartsWith(test_case, "C/")) { + base_cfl_subsampler_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredCflInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredCflInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type]; + } + + // This test modifies intra_pred_mem_. + void TestSpeed(const char* digest, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + enum SubsamplingType SubsamplingType() const { return subsampling_type; } + + CflSubsamplerFunc base_cfl_subsampler_; + CflSubsamplerFunc cur_cfl_subsampler_; +}; + +// There is no case where both source and output have lowest height or width +// when that dimension is subsampled. +int GetLumaWidth(int block_width, SubsamplingType subsampling_type) { + if (block_width == 4) { + const int width_shift = + static_cast<int>(subsampling_type != kSubsamplingType444); + return block_width << width_shift; + } + return block_width; +} + +int GetLumaHeight(int block_height, SubsamplingType subsampling_type) { + if (block_height == 4) { + const int height_shift = + static_cast<int>(subsampling_type == kSubsamplingType420); + return block_height << height_shift; + } + return block_height; +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed( + const char* const digest, const int num_runs) { + // C declines initializing the table in normal circumstances because there are + // assembly implementations. 
+ if (cur_cfl_subsampler_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + const int width = GetLumaWidth(block_width_, subsampling_type); + const int height = GetLumaHeight(block_height_, subsampling_type); + Pixel* src = intra_pred_mem_.ref_src; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + src[j] = rnd.RandRange(1 << bitdepth); + } + src += kMaxBlockSize; + } + const absl::Time start = absl::Now(); + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + for (int run = 0; run < num_runs; ++run) { + cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest, + luma, sizeof(luma), elapsed_time); +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, + subsampling_type>::TestSaturatedValues() { + if (base_cfl_subsampler_ == nullptr) return; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0; + width -= 8) { + for (int height = GetLumaHeight(block_height_, subsampling_type); + height > 0; height -= 8) { + Pixel* src = intra_pred_mem_.ref_src; + for (int y = 0; y < height; ++y) { + Memset(src, (1 << bitdepth) - 1, width); + Memset(src + width, 0, kMaxBlockSize - width); + src += kMaxBlockSize; + } + Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0, + kMaxBlockSize * (kMaxBlockSize - height)); + + int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src, + stride); + cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src, + stride); + if 
(!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]), + reinterpret_cast<uint16_t*>(luma_base[0]), + block_width_, block_height_, + kCflLumaBufferStride, kCflLumaBufferStride, + true)) { + FAIL() << "Result from optimized version of CFL subsampler" + << " differs from reference. max_luma_width: " << width + << " max_luma_height: " << height; + } + } + } +} + +template <int bitdepth, typename Pixel, SubsamplingType subsampling_type> +void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() { + if (base_cfl_subsampler_ == nullptr) return; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0; + width -= 8) { + for (int height = GetLumaHeight(block_height_, subsampling_type); + height > 0; height -= 8) { + Pixel* src = intra_pred_mem_.ref_src; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + src[j] = rnd.RandRange(1 << bitdepth); + } + Memset(src + width, 0, kMaxBlockSize - width); + src += kMaxBlockSize; + } + Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0, + kMaxBlockSize * (kMaxBlockSize - height)); + + int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {}; + base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src, + stride); + cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src, + stride); + if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]), + reinterpret_cast<uint16_t*>(luma_base[0]), + block_width_, block_height_, + kCflLumaBufferStride, kCflLumaBufferStride, + true)) { + FAIL() << "Result from optimized version of CFL subsampler" + << " differs from reference. 
max_luma_width: " << width + << " max_luma_height: " << height; + } + } + } +} + +//------------------------------------------------------------------------------ + +using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>; + +const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) { + static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed"; + static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d"; + static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec"; + static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8"; + static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5"; + static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c"; + static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c"; + static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6"; + static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778"; + static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3"; + static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd"; + static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0"; + static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36"; + static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237"; + + switch (tx_size) { + case kTransformSize4x4: + return kDigest4x4; + case kTransformSize4x8: + return kDigest4x8; + case kTransformSize4x16: + return kDigest4x16; + case kTransformSize8x4: + return kDigest8x4; + case kTransformSize8x8: + return kDigest8x8; + case kTransformSize8x16: + return kDigest8x16; + case kTransformSize8x32: + return kDigest8x32; + case kTransformSize16x4: + return kDigest16x4; + case kTransformSize16x8: + return kDigest16x8; + case kTransformSize16x16: + return kDigest16x16; + case kTransformSize16x32: + return kDigest16x32; + case kTransformSize32x8: + return kDigest32x8; + case 
kTransformSize32x16: + return kDigest32x16; + case kTransformSize32x32: + return kDigest32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs); +} + +TEST_P(CflIntraPredTest8bpp, FixedInput) { + TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1); +} + +TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } + +TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +using CflSubsamplerTest8bpp444 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType444>; +using CflSubsamplerTest8bpp422 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType422>; +using CflSubsamplerTest8bpp420 = + CflSubsamplerTest<8, uint8_t, kSubsamplingType420>; + +const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size, + SubsamplingType subsampling_type) { + static const char* const kDigests4x4[3] = { + "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b", + "1d03f091956838e7f2b113aabd8b9da9"}; + static const char* const kDigests4x8[3] = { + "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d", + "68a334f5d2abecbc78562b3280b5fb0c"}; + static const char* const kDigests4x16[3] = { + "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91", + "b0600f0bc3fbfc374bb3628360dcae5c"}; + static const char* const kDigests8x4[3] = { + "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e", + "3522d3a4dd3839d1a86fb39b31a86d52"}; + static const char* const kDigests8x8[3] = { + "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf", + "082e34ba04d04d7cd6fe408823987602"}; + static const char* const kDigests8x16[3] = { + "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a", + 
"d8d266282cb7123f780bd7266e8f5913"}; + static const char* const kDigests8x32[3] = { + "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff", + "d294fb6399edaa267aa167407c0ebccb"}; + static const char* const kDigests16x4[3] = { + "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64", + "a928ebbec2db1508c8831a440d82eb98"}; + static const char* const kDigests16x8[3] = { + "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b", + "c8e71c2fddbb850c5a50592ee5975368"}; + static const char* const kDigests16x16[3] = { + "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd", + "2ffeed4d592a0455f6d888913969827f"}; + static const char* const kDigests16x32[3] = { + "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883", + "185ca976ccbef7fb5f3f8c6aa22d5a79"}; + static const char* const kDigests32x8[3] = { + "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20", + "9847d88eaaa57c086a2e6aed583048d3"}; + static const char* const kDigests32x16[3] = { + "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc", + "a4189655fe714b82eb88cb5092c0ad76"}; + static const char* const kDigests32x32[3] = { + "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88", + "db200bc8ccbeacd6a42d6b8e5ad1d931"}; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4[subsampling_type]; + case kTransformSize4x8: + return kDigests4x8[subsampling_type]; + case kTransformSize4x16: + return kDigests4x16[subsampling_type]; + case kTransformSize8x4: + return kDigests8x4[subsampling_type]; + case kTransformSize8x8: + return kDigests8x8[subsampling_type]; + case kTransformSize8x16: + return kDigests8x16[subsampling_type]; + case kTransformSize8x32: + return kDigests8x32[subsampling_type]; + case kTransformSize16x4: + return kDigests16x4[subsampling_type]; + case kTransformSize16x8: + return kDigests16x8[subsampling_type]; + case kTransformSize16x16: + return 
kDigests16x16[subsampling_type]; + case kTransformSize16x32: + return kDigests16x32[subsampling_type]; + case kTransformSize32x8: + return kDigests32x8[subsampling_type]; + case kTransformSize32x16: + return kDigests32x16[subsampling_type]; + case kTransformSize32x32: + return kDigests32x32[subsampling_type]; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp444, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp422, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest8bpp420, FixedInput) { + TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 
+//------------------------------------------------------------------------------ + +using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>; + +const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) { + static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d"; + static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b"; + static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd"; + static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90"; + static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0"; + static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe"; + static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d"; + static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7"; + static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef"; + static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7"; + static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79"; + static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1"; + static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74"; + static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6"; + + switch (tx_size) { + case kTransformSize4x4: + return kDigest4x4; + case kTransformSize4x8: + return kDigest4x8; + case kTransformSize4x16: + return kDigest4x16; + case kTransformSize8x4: + return kDigest8x4; + case kTransformSize8x8: + return kDigest8x8; + case kTransformSize8x16: + return kDigest8x16; + case kTransformSize8x32: + return kDigest8x32; + case kTransformSize16x4: + return kDigest16x4; + case kTransformSize16x8: + return kDigest16x8; + case kTransformSize16x16: + return kDigest16x16; + case kTransformSize16x32: + return kDigest16x32; + case kTransformSize32x8: + return kDigest32x8; + case kTransformSize32x16: + return kDigest32x16; + case kTransformSize32x32: + return 
kDigest32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs); +} + +TEST_P(CflIntraPredTest10bpp, FixedInput) { + TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1); +} + +TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } + +TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +using CflSubsamplerTest10bpp444 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType444>; +using CflSubsamplerTest10bpp422 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType422>; +using CflSubsamplerTest10bpp420 = + CflSubsamplerTest<10, uint16_t, kSubsamplingType420>; + +const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size, + SubsamplingType subsampling_type) { + static const char* const kDigests4x4[3] = { + "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451", + "93d1d9df2861240d88f5618e42178654"}; + static const char* const kDigests4x8[3] = { + "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27", + "cf88b6d1b7b025cfa0082361775aeb75"}; + static const char* const kDigests4x16[3] = { + "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c", + "0884a0e53384cd5173035ad8966d8f2f"}; + static const char* const kDigests8x4[3] = { + "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18", + "8070668aa389c1d09f8aaf43c1223e8c"}; + static const char* const kDigests8x8[3] = { + "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193", + "4a1110382e56b42d3b7a4132bccc01ee"}; + static const char* const kDigests8x16[3] = { + "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30", + "bd012ca1cea256dd02c231339a4cf200"}; + static const char* const 
kDigests8x32[3] = { + "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842", + "71888b17b832ef55c0cd9449c0e6b077"}; + static const char* const kDigests16x4[3] = { + "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96", + "917cd884923139d5c05a11000722e3b6"}; + static const char* const kDigests16x8[3] = { + "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9", + "edded6d166a77a6c3ff46fddc13f372f"}; + static const char* const kDigests16x16[3] = { + "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a", + "2f1a91898812d2c9320c7506b3a72eb4"}; + static const char* const kDigests16x32[3] = { + "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796", + "e44e18cb8bda2d34b52c96d5b6b510be"}; + static const char* const kDigests32x8[3] = { + "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f", + "d12746071bee96ddc075c6368bc9fbaf"}; + static const char* const kDigests32x16[3] = { + "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7", + "02513142d109aee10f081cacfb33d1c5"}; + static const char* const kDigests32x32[3] = { + "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f", + "12d13828db57605b00ce99469489651d"}; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4[subsampling_type]; + case kTransformSize4x8: + return kDigests4x8[subsampling_type]; + case kTransformSize4x16: + return kDigests4x16[subsampling_type]; + case kTransformSize8x4: + return kDigests8x4[subsampling_type]; + case kTransformSize8x8: + return kDigests8x8[subsampling_type]; + case kTransformSize8x16: + return kDigests8x16[subsampling_type]; + case kTransformSize8x32: + return kDigests8x32[subsampling_type]; + case kTransformSize16x4: + return kDigests16x4[subsampling_type]; + case kTransformSize16x8: + return kDigests16x8[subsampling_type]; + case kTransformSize16x16: + return kDigests16x16[subsampling_type]; + case kTransformSize16x32: + return 
kDigests16x32[subsampling_type]; + case kTransformSize32x8: + return kDigests32x8[subsampling_type]; + case kTransformSize32x16: + return kDigests32x16[subsampling_type]; + case kTransformSize32x32: + return kDigests32x32[subsampling_type]; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp444, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp422, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest10bpp420, FixedInput) { + TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); } + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// Cfl predictors are available only for transform sizes with +// max(width, height) <= 32. 
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize32x8, + kTransformSize32x16, kTransformSize32x32}; + +INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420, 
+ testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc new file mode 100644 index 0000000..e670769 --- /dev/null +++ b/src/dsp/intrapred_directional.cc @@ -0,0 +1,252 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +template <typename Pixel> +void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const auto* const top = static_cast<const Pixel*>(top_row); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + assert(xstep > 0); + + // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to + // |top[top_base_x]|. This corresponds to a 45 degree prediction. + if (xstep == 64) { + // 7.11.2.10. Intra edge upsample selection process + // if ( d <= 0 || d >= 40 ) useUpsample = 0 + // For |upsampled_top| the delta is |predictor_angle - 90|. Since the + // |predictor_angle| is 45 the delta is also 45. 
+ assert(!upsampled_top); + const Pixel* top_ptr = top + 1; + for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) { + memcpy(dst, top_ptr, sizeof(*top_ptr) * width); + } + return; + } + + const int upsample_shift = static_cast<int>(upsampled_top); + const int max_base_x = ((width + height) - 1) << upsample_shift; + const int scale_bits = 6 - upsample_shift; + const int base_step = 1 << upsample_shift; + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + + if (top_base_x >= max_base_x) { + for (int i = y; i < height; ++i) { + Memset(dst, top[max_base_x], width); + dst += stride; + } + return; + } + + const int shift = ((top_x << upsample_shift) & 0x3F) >> 1; + int x = 0; + do { + if (top_base_x >= max_base_x) { + Memset(dst + x, top[max_base_x], width - x); + break; + } + + const int val = + top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; + dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/); + top_base_x += base_step; + } while (++x < width); + + dst += stride; + top_x += xstep; + } while (++y < height); +} + +template <typename Pixel> +void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const int width, const int height, + const int xstep, const int ystep, + const bool upsampled_top, + const bool upsampled_left) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + assert(xstep > 0); + assert(ystep > 0); + + const int upsample_top_shift = static_cast<int>(upsampled_top); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int scale_bits_x = 6 - upsample_top_shift; + const int scale_bits_y = 6 - upsample_left_shift; + const int min_base_x = -(1 << upsample_top_shift); + const int base_step_x = 1 << upsample_top_shift; + int y = 0; + int top_x = -xstep; + do { 
+ int top_base_x = top_x >> scale_bits_x; + int left_y = (y << 6) - ystep; + int x = 0; + do { + int val; + if (top_base_x >= min_base_x) { + const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1; + val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift; + } else { + // Note this assumes an arithmetic shift to handle negative values. + const int left_base_y = left_y >> scale_bits_y; + const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1; + assert(left_base_y >= -(1 << upsample_left_shift)); + val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; + } + dst[x] = RightShiftWithRounding(val, 5); + top_base_x += base_step_x; + left_y -= ystep; + } while (++x < width); + + top_x -= xstep; + dst += stride; + } while (++y < height); +} + +template <typename Pixel> +void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled_left) { + const auto* const left = static_cast<const Pixel*>(left_column); + stride /= sizeof(Pixel); + + assert(ystep > 0); + + const int upsample_shift = static_cast<int>(upsampled_left); + const int scale_bits = 6 - upsample_shift; + const int base_step = 1 << upsample_shift; + // Zone3 never runs out of left_column values. 
+ assert((width + height - 1) << upsample_shift > // max_base_y + ((ystep * width) >> scale_bits) + + base_step * (height - 1)); // left_base_y + + int left_y = ystep; + int x = 0; + do { + auto* dst = static_cast<Pixel*>(dest); + + int left_base_y = left_y >> scale_bits; + int y = 0; + do { + const int shift = ((left_y << upsample_shift) & 0x3F) >> 1; + const int val = + left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift; + dst[x] = RightShiftWithRounding(val, 5); + dst += stride; + left_base_y += base_step; + } while (++y < height); + + left_y += ystep; + } while (++x < width); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint8_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint8_t>; + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; + dsp->directional_intra_predictor_zone3 = + 
DirectionalIntraPredictorZone3_C<uint16_t>; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint16_t>; +#endif +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void IntraPredDirectionalInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_directional.h b/src/dsp/intrapred_directional.h new file mode 100644 index 0000000..bcd1bc1 --- /dev/null +++ b/src/dsp/intrapred_directional.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. 
+// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_directional_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_directional_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*. This function is not +// thread-safe. +void IntraPredDirectionalInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_ diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc new file mode 100644 index 0000000..ebf9da0 --- /dev/null +++ b/src/dsp/intrapred_directional_test.cc @@ -0,0 +1,929 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; +constexpr int kNumDirectionalIntraPredictors = 3; + +constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203}; + +const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = { + "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2", + "kDirectionalIntraPredictorZone3"}; + +int16_t GetDirectionalIntraPredictorDerivative(const int angle) { + EXPECT_GE(angle, 3); + EXPECT_LE(angle, 87); + return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1]; +} + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int 
mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// DirectionalIntraPredTest + +template <int bitdepth, typename Pixel> +class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + DirectionalIntraPredTest() = default; + DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete; + DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete; + ~DirectionalIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + enum Zone { kZone1, kZone2, kZone3, kNumZones }; + + enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 }; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredDirectionalInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1; + base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2; + base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + 
base_directional_intra_pred_zone1_ = nullptr; + base_directional_intra_pred_zone2_ = nullptr; + base_directional_intra_pred_zone3_ = nullptr; + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredDirectionalInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredDirectionalInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1; + cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2; + cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3; + + // Skip functions that haven't been specialized for this particular + // architecture. + if (cur_directional_intra_pred_zone1_ == + base_directional_intra_pred_zone1_) { + cur_directional_intra_pred_zone1_ = nullptr; + } + if (cur_directional_intra_pred_zone2_ == + base_directional_intra_pred_zone2_) { + cur_directional_intra_pred_zone2_ = nullptr; + } + if (cur_directional_intra_pred_zone3_ == + base_directional_intra_pred_zone3_) { + cur_directional_intra_pred_zone3_ = nullptr; + } + } + + bool IsEdgeUpsampled(int delta, const int filter_type) const { + delta = std::abs(delta); + if (delta == 0 || delta >= 40) return false; + const int block_wh = block_width_ + block_height_; + return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16; + } + + // Returns the minimum and maximum (exclusive) range of angles that the + // predictor should be applied to. 
+ void GetZoneAngleRange(const Zone zone, int* const min_angle, + int* const max_angle) const { + ASSERT_NE(min_angle, nullptr); + ASSERT_NE(max_angle, nullptr); + switch (zone) { + // The overall minimum angle comes from mode D45_PRED, yielding: + // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36 + // The overall maximum angle comes from mode D203_PRED, yielding: + // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212 + // The angles 180 and 90 are not permitted because they correspond to + // V_PRED and H_PRED, which are handled in distinct functions. + case kZone1: + *min_angle = 36; + *max_angle = 87; + break; + case kZone2: + *min_angle = 93; + *max_angle = 177; + break; + case kZone3: + *min_angle = 183; + *max_angle = 212; + break; + case kNumZones: + FAIL() << "Invalid zone value: " << zone; + break; + } + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors], + Zone zone, int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_; + DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_; + DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_; + DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_; + DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_; + DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_; +}; + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumDirectionalIntraPredictors], const Zone zone, + const int num_runs) { + switch (zone) { + case kZone1: + if (cur_directional_intra_pred_zone1_ == nullptr) return; + break; + case kZone2: + if (cur_directional_intra_pred_zone2_ == nullptr) return; + break; + case kZone3: + if (cur_directional_intra_pred_zone3_ == nullptr) return; + break; + case kNumZones: + FAIL() << "Invalid zone 
value: " << zone; + break; + } + ASSERT_NE(digests, nullptr); + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + // Allocate separate blocks for each angle + filter + upsampled combination. + // Add a 1 pixel right border to test for overwrites. + static constexpr int kMaxZoneAngles = 27; // zone 2 + static constexpr int kMaxFilterTypes = 2; + static constexpr int kBlockBorder = 1; + static constexpr int kBorderSize = + kBlockBorder * kMaxZoneAngles * kMaxFilterTypes; + const int ref_stride = + kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize; + const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_; + + using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>; + AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)), + &AlignedFree); + AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)), + &AlignedFree); + ASSERT_NE(ref_src, nullptr); + ASSERT_NE(dest, nullptr); + + const int mask = (1 << bitdepth) - 1; + for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) { + ref_src[i] = rnd.Rand16() & mask; + } + + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle)); + + absl::Duration elapsed_time; + for (int run = 0; run < num_runs; ++run) { + Pixel* dst = dest.get(); + memcpy(dst, ref_src.get(), ref_alloc_size); + for (const auto& base_angle : kBaseAngles) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop; + angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle < min_angle || predictor_angle > max_angle) { + continue; + } + + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << 
angle_delta; + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]); + if (predictor_angle < 90) { + ASSERT_EQ(zone, kZone1); + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone1_(dst, stride, top, block_width_, + block_height_, xstep, + upsampled_top); + elapsed_time += absl::Now() - start; + } else if (predictor_angle < 180) { + ASSERT_EQ(zone, kZone2); + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone2_( + dst, stride, top, left, block_width_, block_height_, xstep, + ystep, upsampled_top, upsampled_left); + elapsed_time += absl::Now() - start; + } else { + ASSERT_EQ(zone, kZone3); + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + const absl::Time start = absl::Now(); + cur_directional_intra_pred_zone3_(dst, stride, left, block_width_, + block_height_, ystep, + upsampled_left); + elapsed_time += absl::Now() - start; + } + dst += block_width_ + kBlockBorder; + } + } + } + } + + test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone], + digests[zone], dest.get(), ref_alloc_size, + elapsed_time); +} + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + for (int i = kZone1; i < kNumZones; ++i) { + switch (i) { + case kZone1: + if 
(cur_directional_intra_pred_zone1_ == nullptr) continue; + break; + case kZone2: + if (cur_directional_intra_pred_zone2_ == nullptr) continue; + break; + case kZone3: + if (cur_directional_intra_pred_zone3_ == nullptr) continue; + break; + case kNumZones: + FAIL() << "Invalid zone value: " << i; + break; + } + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE( + GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle)); + + for (const auto& base_angle : kBaseAngles) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop; + angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle <= min_angle || predictor_angle >= max_angle) { + continue; + } + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << angle_delta; + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + if (predictor_angle < 90) { + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top, + block_width_, block_height_, + xstep, upsampled_top); + } else if (predictor_angle < 180) { + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + cur_directional_intra_pred_zone2_( + intra_pred_mem_.dst, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + } else { + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + 
cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left, + block_width_, block_height_, + ystep, upsampled_left); + } + + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << kDirectionalPredNames[i] + << " (angle: " << predictor_angle + << " filter type: " << filter_type + << ") to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + return; + } + } + } + } + } +} + +template <int bitdepth, typename Pixel> +void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + + for (int i = kZone1; i < kNumZones; ++i) { + // Only run when there is a reference version (base) and a different + // optimized version (cur). 
+ switch (i) { + case kZone1: + if (base_directional_intra_pred_zone1_ == nullptr || + cur_directional_intra_pred_zone1_ == nullptr) { + continue; + } + break; + case kZone2: + if (base_directional_intra_pred_zone2_ == nullptr || + cur_directional_intra_pred_zone2_ == nullptr) { + continue; + } + break; + case kZone3: + if (base_directional_intra_pred_zone3_ == nullptr || + cur_directional_intra_pred_zone3_ == nullptr) { + continue; + } + break; + case kNumZones: + FAIL() << "Invalid zone value: " << i; + break; + } + int min_angle = 0, max_angle = 0; + ASSERT_NO_FATAL_FAILURE( + GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle)); + + for (const auto& base_angle : kBaseAngles) { + for (int n = 0; n < 1000; ++n) { + for (int filter_type = 0; filter_type <= 1; ++filter_type) { + for (int angle_delta = kAngleDeltaStart; + angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) { + const int predictor_angle = base_angle + angle_delta; + if (predictor_angle <= min_angle || predictor_angle >= max_angle) { + continue; + } + ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle + << " angle_delta: " << angle_delta; + + intra_pred_mem_.Reset(&rnd); + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + + const bool upsampled_left = + IsEdgeUpsampled(predictor_angle - 180, filter_type); + const bool upsampled_top = + IsEdgeUpsampled(predictor_angle - 90, filter_type); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + if (predictor_angle < 90) { + const int xstep = + GetDirectionalIntraPredictorDerivative(predictor_angle); + base_directional_intra_pred_zone1_( + intra_pred_mem_.ref_src, stride, top, block_width_, + block_height_, xstep, upsampled_top); + cur_directional_intra_pred_zone1_( + intra_pred_mem_.dst, stride, top, block_width_, block_height_, + xstep, upsampled_top); + } else if (predictor_angle < 180) { + const int xstep = + GetDirectionalIntraPredictorDerivative(180 - predictor_angle); + 
const int ystep = + GetDirectionalIntraPredictorDerivative(predictor_angle - 90); + base_directional_intra_pred_zone2_( + intra_pred_mem_.ref_src, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + cur_directional_intra_pred_zone2_( + intra_pred_mem_.dst, stride, top, left, block_width_, + block_height_, xstep, ystep, upsampled_top, upsampled_left); + } else { + ASSERT_LT(predictor_angle, 270); + const int ystep = + GetDirectionalIntraPredictorDerivative(270 - predictor_angle); + base_directional_intra_pred_zone3_( + intra_pred_mem_.ref_src, stride, left, block_width_, + block_height_, ystep, upsampled_left); + cur_directional_intra_pred_zone3_( + intra_pred_mem_.dst, stride, left, block_width_, + block_height_, ystep, upsampled_left); + } + + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << kDirectionalPredNames[i] + << " differs from reference at angle " + << predictor_angle << " with filter type " + << filter_type << " in iteration #" << n; + return; + } + } + } + } + } + } +} + +using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>; + +const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = { + "9cfc1da729ad08682e165826c29b280b", + "bb73539c7afbda7bddd2184723b932d6", + "9d2882800ffe948196e984a26a2da72c", + }; + static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = { + "090efe6f83cc6fa301f65d3bbd5c38d2", + "d0fba4cdfb90f8bd293a94cae9db1a15", + "f7ad0eeab4389d0baa485d30fec87617", + }; + static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = { + "1d32b33c75fe85248c48cdc8caa78d84", + "7000e18159443d366129a6cc6ef8fcee", + "06c02fac5f8575f687abb3f634eb0b4c", + }; + static const char* const 
kDigests8x4[kNumDirectionalIntraPredictors] = { + "1b591799685bc135982114b731293f78", + "5cd9099acb9f7b2618dafa6712666580", + "d023883efede88f99c19d006044d9fa1", + }; + static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = { + "f1e46ecf62a2516852f30c5025adb7ea", + "864442a209c16998065af28d8cdd839a", + "411a6e554868982af577de69e53f12e8", + }; + static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = { + "89278302be913a85cfb06feaea339459", + "6c42f1a9493490cd4529fd40729cec3c", + "2516b5e1c681e5dcb1acedd5f3d41106", + }; + static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = { + "aea7078f3eeaa8afbfe6c959c9e676f1", + "cad30babf12729dda5010362223ba65c", + "ff384ebdc832007775af418a2aae1463", + }; + static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = { + "964a821c313c831e12f4d32e616c0b55", + "adf6dad3a84ab4d16c16eea218bec57a", + "a54fa008d43895e523474686c48a81c2", + }; + static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = { + "fe2851b4e4f9fcf924cf17d50415a4c0", + "50a0e279c481437ff315d08eb904c733", + "0682065c8fb6cbf9be4949316c87c9e5", + }; + static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = { + "ef15503b1943642e7a0bace1616c0e11", + "bf1a4d3f855f1072a902a88ec6ce0350", + "7e87a03e29cd7fd843fd71b729a18f3f", + }; + static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = { + "f7b636615d2e5bf289b5db452a6f188d", + "e95858c532c10d00b0ce7a02a02121dd", + "34a18ccf58ef490f32268e85ce8c7de4", + }; + static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = { + "b250099986c2fab9670748598058846b", + "f25d80af4da862a9b6b72979f1e17cb4", + "5347dc7bc346733b4887f6c8ad5e0898", + }; + static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = { + "72e4c9f8af043b1cb1263490351818ab", + "1fc010d2df011b9e4e3d0957107c78df", + "f4cbfa3ca941ef08b972a68d7e7bafc4", + }; + static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = 
{ + "37e5a1aaf7549d2bce08eece9d20f0f6", + "6a2794025d0aca414ab17baa3cf8251a", + "63dd37a6efdc91eeefef166c99ce2db1", + }; + static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = { + "198aabc958992eb49cceab97d1acb43e", + "aee88b6c8bacfcf38799fe338e6c66e7", + "01e8f8f96696636f6d79d33951907a16", + }; + static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = { + "0611390202c4f90f7add7aec763ded58", + "960240c7ceda2ccfac7c90b71460578a", + "7e7d97594aab8ad56e8c01c340335607", + }; + static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = { + "7e1f567e7fc510757f2d89d638bc826f", + "c929d687352ce40a58670be2ce3c8c90", + "f6881e6a9ba3c3d3d730b425732656b1", + }; + static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = { + "27b4c2a7081d4139f22003ba8b6dfdf2", + "301e82740866b9274108a04c872fa848", + "98d3aa4fef838f4abf00dac33806659f", + }; + static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = { + "b31816db8fade3accfd975b21aa264c7", + "2adce01a03b9452633d5830e1a9b4e23", + "7b988fadba8b07c36e88d7be6b270494", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case 
kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = static_cast<int>(5e7 / (block_width_ * block_height_)); + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_), + static_cast<Zone>(i), num_runs); + } +} + +TEST_P(DirectionalIntraPredTest8bpp, FixedInput) { + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_), + static_cast<Zone>(i), 1); + } +} + +TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 + +using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>; + +const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = { + "a683f4d7ccd978737615f61ecb4d638d", + "90c94374eaf7e9501f197863937b8639", + "0d3969cd081523ac6a906eecc7980c43", + }; + static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = { + "c3ffa2979b325644e4a56c882fe27347", + "1f61f5ee413a9a3b8d1d93869ec2aee0", + "4795ea944779ec4a783408769394d874", + }; + static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = { + "45c3282c9aa51024c1d64a40f230aa45", + "5cd47dd69f8bd0b15365a0c5cfc0a49a", + "06336c507b05f98c1d6a21abc43e6182", + }; + static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = { + "7370476ff0abbdc5e92f811b8879c861", + "a239a50adb28a4791b52a0dfff3bee06", + "4779a17f958a9ca04e8ec08c5aba1d36", + }; + static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = { + "305463f346c376594f82aad8304e0362", + 
"0cd481e5bda286c87a645417569fd948", + "48c7899dc9b7163b0b1f61b3a2b4b73e", + }; + static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = { + "5c18fd5339be90628c82b1fb6af50d5e", + "35eaa566ebd3bb7c903cfead5dc9ac78", + "9fdb0e790e5965810d02c02713c84071", + }; + static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = { + "2168d6cc858c704748b7b343ced2ac3a", + "1d3ce273107447faafd2e55877e48ffb", + "d344164049d1fe9b65a3ae8764bbbd37", + }; + static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = { + "dcef2cf51abe3fe150f388a14c762d30", + "6a810b289b1c14f8eab8ca1274e91ecd", + "c94da7c11f3fb11963d85c8804fce2d9", + }; + static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = { + "50a0d08b0d99b7a574bad2cfb36efc39", + "2dcb55874db39da70c8ca1318559f9fe", + "6390bcd30ff3bc389ecc0a0952bea531", + }; + static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = { + "7146c83c2620935606d49f3cb5876f41", + "2318ddf30c070a53c9b9cf199cd1b2c5", + "e9042e2124925aa7c1b6110617cb10e8", + }; + static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = { + "c970f401de7b7c5bb4e3ad447fcbef8f", + "a18cc70730eecdaa31dbcf4306ff490f", + "32c1528ad4a576a2210399d6b4ccd46e", + }; + static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = { + "00b3f0007da2e5d01380594a3d7162d5", + "1971af519e4a18967b7311f93efdd1b8", + "e6139769ce5a9c4982cfab9363004516", + }; + static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = { + "08107ad971179cc9f465ae5966bd4901", + "b215212a3c0dfe9182c4f2e903d731f7", + "791274416a0da87c674e1ae318b3ce09", + }; + static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = { + "94ea6cccae35b5d08799aa003ac08ccf", + "ae105e20e63fb55d4fd9d9e59dc62dde", + "973d0b2358ea585e4f486e7e645c5310", + }; + static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = { + "d14c695c4853ddf5e5d8256bc1d1ed60", + "6bd0ebeb53adecc11442b1218b870cb7", + 
"e03bc402a9999aba8272275dce93e89f", + }; + static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = { + "b21a8a8723758392ee659eeeae518a1e", + "e50285454896210ce44d6f04dfde05a7", + "f0f8ea0c6c2acc8d7d390927c3a90370", + }; + static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = { + "ce51db16fd4fa56e601631397b098c89", + "aa87a8635e02c1e91d13158c61e443f6", + "4c1ee3afd46ef34bd711a34d0bf86f13", + }; + static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = { + "25aaf5971e24e543e3e69a47254af777", + "eb6f444b3df127d69460778ab5bf8fc1", + "2f846cc0d506f90c0a58438600819817", + }; + static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = { + "b26ce5b5f4b5d4a438b52e5987877fb8", + "35721a00a70938111939cf69988d928e", + "0af7ec35939483fac82c246a13845806", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = 
static_cast<int>(5e7 / (block_width_ * block_height_)); + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), num_runs); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, FixedInput) { + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), 1); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + 
return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc new file mode 100644 index 0000000..f4bd296 --- /dev/null +++ b/src/dsp/intrapred_filter.cc @@ -0,0 +1,144 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// FilterIntraPredictor_C + +// The recursive filter applies a different filter to the top 4 and 2 left +// pixels to produce each pixel in a 4x2 sub-block. Each successive 4x2 uses the +// prediction output of the blocks above and to the left, unless they are +// adjacent to the |top_row| or |left_column|. The set of 8 filters is selected +// according to |pred|. 
+template <int bitdepth, typename Pixel> +void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const FilterIntraPredictor pred, const int width, + const int height) { + const int kMaxPixel = (1 << bitdepth) - 1; + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + + assert(width <= 32 && height <= 32); + + Pixel buffer[3][33]; // cache 2 rows + top & left boundaries + memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0])); + + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + int row0 = 0, row2 = 2; + int ystep = 1; + int y = 0; + do { + buffer[1][0] = left[y]; + buffer[row2][0] = left[y + 1]; + int x = 1; + do { + const Pixel p0 = buffer[row0][x - 1]; // top-left + const Pixel p1 = buffer[row0][x + 0]; // top 0 + const Pixel p2 = buffer[row0][x + 1]; // top 1 + const Pixel p3 = buffer[row0][x + 2]; // top 2 + const Pixel p4 = buffer[row0][x + 3]; // top 3 + const Pixel p5 = buffer[1][x - 1]; // left 0 + const Pixel p6 = buffer[row2][x - 1]; // left 1 + for (int i = 0; i < 8; ++i) { + const int xoffset = i & 0x03; + const int yoffset = (i >> 2) * ystep; + const int value = kFilterIntraTaps[pred][i][0] * p0 + + kFilterIntraTaps[pred][i][1] * p1 + + kFilterIntraTaps[pred][i][2] * p2 + + kFilterIntraTaps[pred][i][3] * p3 + + kFilterIntraTaps[pred][i][4] * p4 + + kFilterIntraTaps[pred][i][5] * p5 + + kFilterIntraTaps[pred][i][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). 
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>( + Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel)); + } + x += 4; + } while (x < width); + memcpy(dst, &buffer[1][1], width * sizeof(dst[0])); + dst += stride; + memcpy(dst, &buffer[row2][1], width * sizeof(dst[0])); + dst += stride; + + // The final row becomes the top for the next pass. + row0 ^= 2; + row2 ^= 2; + ystep = -ystep; + y += 2; + } while (y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void IntraPredFilterInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.h b/src/dsp/intrapred_filter.h new file mode 100644 index 0000000..8146b82 --- /dev/null +++ b/src/dsp/intrapred_filter.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_filter_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_filter_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, +// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and +// Dsp::filter_intra_predictor. This function is not thread-safe. +void IntraPredFilterInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc new file mode 100644 index 0000000..c420f0a --- /dev/null +++ b/src/dsp/intrapred_filter_test.cc @@ -0,0 +1,554 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = { + "kFilterIntraPredictorDc", "kFilterIntraPredictorVertical", + "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157", + "kFilterIntraPredictorPaeth", +}; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct 
IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// FilterIntraPredTest + +template <int bitdepth, typename Pixel> +class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + FilterIntraPredTest() = default; + FilterIntraPredTest(const FilterIntraPredTest&) = delete; + FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete; + ~FilterIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredFilterInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_filter_intra_pred_ = dsp->filter_intra_predictor; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + // No need to compare C with itself. 
+ base_filter_intra_pred_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredFilterInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredFilterInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + // Put the current architecture-specific implementation up for testing and + // comparison against C version. + cur_filter_intra_pred_ = dsp->filter_intra_predictor; + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumFilterIntraPredictors], + int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + FilterIntraPredictorFunc base_filter_intra_pred_; + FilterIntraPredictorFunc cur_filter_intra_pred_; +}; + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumFilterIntraPredictors], const int num_runs) { + ASSERT_NE(digests, nullptr); + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + // IntraPredInit_C() leaves the filter function empty. 
+ if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i], + digests[i], intra_pred_mem_.dst, + sizeof(intra_pred_mem_.dst), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + Pixel* const left = intra_pred_mem_.left_mem + 16; + Pixel* const top = intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + // IntraPredInit_C() leaves the filter function empty. + if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i] + << " to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + } + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Skip the 'C' test case as this is used as the reference. 
+ if (base_filter_intra_pred_ == nullptr) return; + + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + // It may be worthwhile to temporarily increase this loop size when testing + // changes that specifically affect this test. + for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left, + static_cast<FilterIntraPredictor>(i), + block_width_, block_height_); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << kFilterIntraPredNames[i] + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ +using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>; + +const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d", + "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5", + "2d3711616c8d8798f608e313cb07a72a", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06", + "73e9d1a9b6fa3e96fbd65c7dce507529", 
"e3ae17d9338e5aa3420d31d0e2d7ee87", + "750dbfe3bc5508b7031957a1d315b8bc", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf", + "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1", + "976273745b94a561fd52f5aa96fb280f", + }; + static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac", + "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c", + "eab59a3710d9bdeca8fa03a15d3f95d6", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b", + "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070", + "918c2553635763a0756b20154096bca6", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142", + "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13", + "893e6995522e23ed3d613ef3797ca580", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa", + "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f", + "00c909b749e36142b133a7357271e83e", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7", + "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666", + "9e350d1cd65d520194281633f566810d", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1", + "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e", + "2f37e586769dfb88d9d4116b9c28c5ab", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { + 
"36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005", + "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93", + "6322fba6f3bcb1f6c8e78160d200809c", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f", + "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252", + "6491add6b665aafc364c8c104a6a233d", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337", + "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8", + "bb8c34b143910ba49b01d13e94d936ac", + }; + static const char* const kDigests32x16[kNumFilterIntraPredictors] = { + "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056", + "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313", + "75f430fdea0b7b5b66866fd68a795a6a", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711", + "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9", + "73c73237a5d891096066b186abf96854", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << 
"Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest8bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>; + +const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1", + "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0", + "1b363db246aa5400f49479b7d5d41799", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697", + "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c", + "c6bcf564e63c42148d9917f089566432", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854", + "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146", + "f3c07ea65db08c639136a5a9270f95ff", + }; + static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce", + "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7", + "5b58567f94ae9fa930f700c68c17399d", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6", + 
"1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b", + "5187bb3df943d154cb01fb2f244ff86f", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb", + "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436", + "0a8c23047b0364f5687b62b01f043359", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b", + "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc", + "6e963a89beac6f3a362c269d1017f9a8", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205", + "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204", + "19020b82ce8bb49a511820c7e1d58e99", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38", + "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b", + "c12180a6dcde0c3579befbb5304ff70b", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { + "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e", + "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca", + "4d20aa440e38b6b7ac83c8c54d313169", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe", + "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9", + "e7640d81f891e1d06e7da75c6ae74d93", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8", + "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783", + "78e9017f7434665d32ec59795aed0012", + }; + static const char* const 
kDigests32x16[kNumFilterIntraPredictors] = { + "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883", + "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee", + "b5951334dfbe6229d828e03cd2d98538", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230", + "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5", + "4530222ea1de546e202630fcf43f4526", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest10bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +// Filter-intra and Cfl predictors are available only for transform sizes +// with max(width, height) <= 32. 
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize32x8, + kTransformSize32x16, kTransformSize32x32}; + +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc new file mode 100644 index 0000000..83c005e --- /dev/null +++ b/src/dsp/intrapred_smooth.cc @@ -0,0 +1,738 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_smooth.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdlib> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +template <int block_width, int block_height, typename Pixel> +struct SmoothFuncs_C { + SmoothFuncs_C() = delete; + + static void Smooth(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothHorizontal(void* dest, ptrdiff_t stride, + const void* top_row, const void* left_column); +}; + +constexpr uint8_t kSmoothWeights[] = { + // block dimension = 4 + 255, 149, 85, 64, + // block dimension = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // block dimension = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // block dimension = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // block dimension = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, + 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, + 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4}; + +// SmoothFuncs_C::Smooth +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel top_right = top[block_width - 1]; + const Pixel bottom_left = left[block_height - 1]; + static_assert( + block_width 
>= 4 && block_height >= 4, + "Weights for smooth predictor undefined for block width/height < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]); + uint32_t pred = weights_y[y] * top[x]; + pred += weights_x[x] * left[y]; + pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; + pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; + // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1) + // + 256. With the descale there's no need for saturation. + dst[x] = static_cast<Pixel>( + RightShiftWithRounding(pred, kSmoothWeightScale + 1)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothVertical +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel bottom_left = left[block_height - 1]; + static_assert(block_height >= 4, + "Weights for smooth predictor undefined for block height < 4"); + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y]); + uint32_t pred = weights_y[y] * top[x]; + pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left; + dst[x] = + 
static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothHorizontal +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal( + void* const dest, ptrdiff_t stride, const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel top_right = top[block_width - 1]; + static_assert(block_width >= 4, + "Weights for smooth predictor undefined for block width < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_x[x]); + uint32_t pred = weights_x[x] * left[y]; + pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right; + dst[x] = + static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// ----------------------------------------------------------------------------- + +template <typename Pixel> +struct SmoothDefs { + SmoothDefs() = delete; + + using _4x4 = SmoothFuncs_C<4, 4, Pixel>; + using _4x8 = SmoothFuncs_C<4, 8, Pixel>; + using _4x16 = SmoothFuncs_C<4, 16, Pixel>; + using _8x4 = SmoothFuncs_C<8, 4, Pixel>; + using _8x8 = SmoothFuncs_C<8, 8, Pixel>; + using _8x16 = SmoothFuncs_C<8, 16, Pixel>; + using _8x32 = SmoothFuncs_C<8, 32, Pixel>; + using _16x4 = SmoothFuncs_C<16, 4, Pixel>; + using _16x8 = SmoothFuncs_C<16, 8, Pixel>; + using _16x16 = SmoothFuncs_C<16, 16, Pixel>; + using _16x32 = SmoothFuncs_C<16, 32, Pixel>; + using _16x64 = SmoothFuncs_C<16, 64, Pixel>; + using _32x8 = SmoothFuncs_C<32, 8, Pixel>; + using _32x16 = SmoothFuncs_C<32, 16, Pixel>; + using _32x32 
= SmoothFuncs_C<32, 32, Pixel>; + using _32x64 = SmoothFuncs_C<32, 64, Pixel>; + using _64x16 = SmoothFuncs_C<64, 16, Pixel>; + using _64x32 = SmoothFuncs_C<64, 32, Pixel>; + using _64x64 = SmoothFuncs_C<64, 64, Pixel>; +}; + +using Defs = SmoothDefs<uint8_t>; + +// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of +// the same size. +#define INIT_SMOOTH_WxH(DEFS, W, H) \ + dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \ + DEFS::_##W##x##H::Smooth; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothVertical] = \ + DEFS::_##W##x##H::SmoothVertical; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothHorizontal] = \ + DEFS::_##W##x##H::SmoothHorizontal + +#define INIT_SMOOTH(DEFS) \ + INIT_SMOOTH_WxH(DEFS, 4, 4); \ + INIT_SMOOTH_WxH(DEFS, 4, 8); \ + INIT_SMOOTH_WxH(DEFS, 4, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 4); \ + INIT_SMOOTH_WxH(DEFS, 8, 8); \ + INIT_SMOOTH_WxH(DEFS, 8, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 4); \ + INIT_SMOOTH_WxH(DEFS, 16, 8); \ + INIT_SMOOTH_WxH(DEFS, 16, 16); \ + INIT_SMOOTH_WxH(DEFS, 16, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 64); \ + INIT_SMOOTH_WxH(DEFS, 32, 8); \ + INIT_SMOOTH_WxH(DEFS, 32, 16); \ + INIT_SMOOTH_WxH(DEFS, 32, 32); \ + INIT_SMOOTH_WxH(DEFS, 32, 64); \ + INIT_SMOOTH_WxH(DEFS, 64, 16); \ + INIT_SMOOTH_WxH(DEFS, 64, 32); \ + INIT_SMOOTH_WxH(DEFS, 64, 64) + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(Defs); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Defs::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + 
Defs::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + Defs::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Defs::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + Defs::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + Defs::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Defs::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + Defs::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + Defs::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Defs::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + Defs::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + Defs::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + 
Defs::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + Defs::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + Defs::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Defs::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + Defs::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + Defs::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Defs::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + Defs::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + Defs::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + Defs::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + Defs::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + 
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + Defs::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + Defs::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + Defs::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + Defs::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + Defs::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + Defs::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + Defs::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + Defs::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + Defs::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + Defs::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + Defs::_16x64::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + Defs::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + Defs::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + Defs::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + Defs::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + Defs::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + Defs::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + Defs::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + Defs::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + Defs::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + Defs::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = 
+ Defs::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + Defs::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + Defs::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + Defs::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + Defs::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + Defs::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + Defs::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + Defs::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + Defs::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + Defs::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + Defs::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + Defs::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + Defs::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using DefsHbd = SmoothDefs<uint16_t>; + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(DefsHbd); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + DefsHbd::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + DefsHbd::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + DefsHbd::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + DefsHbd::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + DefsHbd::_4x16::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + DefsHbd::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + DefsHbd::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + DefsHbd::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + DefsHbd::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + DefsHbd::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + DefsHbd::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + DefsHbd::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = 
+ DefsHbd::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + DefsHbd::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + DefsHbd::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + DefsHbd::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + DefsHbd::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + DefsHbd::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + DefsHbd::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + DefsHbd::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + DefsHbd::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + DefsHbd::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + DefsHbd::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + DefsHbd::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + DefsHbd::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + DefsHbd::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + DefsHbd::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + 
DefsHbd::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + DefsHbd::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + DefsHbd::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + DefsHbd::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + DefsHbd::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + DefsHbd::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + DefsHbd::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + DefsHbd::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + DefsHbd::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + DefsHbd::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + DefsHbd::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + DefsHbd::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + DefsHbd::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_SMOOTH_WxH +#undef INIT_SMOOTH +} // namespace + +void IntraPredSmoothInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h new file mode 100644 index 0000000..6802003 --- /dev/null +++ b/src/dsp/intrapred_smooth.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 
Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_smooth_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_smooth_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_ diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc new file mode 100644 index 0000000..335aa2f --- /dev/null +++ b/src/dsp/intrapred_test.cc @@ -0,0 +1,710 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred.h" + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/intrapred_smooth.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : 
ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|. 
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// IntraPredTest + +template <int bitdepth, typename Pixel> +class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + IntraPredTest() = default; + IntraPredTest(const IntraPredTest&) = delete; + IntraPredTest& operator=(const IntraPredTest&) = delete; + ~IntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredInit_C(); + IntraPredSmoothInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_], + sizeof(base_intrapreds_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + memset(base_intrapreds_, 0, sizeof(base_intrapreds_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredInit_SSE4_1(); + IntraPredSmoothInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredInit_NEON(); + IntraPredSmoothInit_NEON(); + } else { + FAIL() << "Unrecognized 
architecture prefix in test case name: " + << test_case; + } + + memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_], + sizeof(cur_intrapreds_)); + + for (int i = 0; i < kNumIntraPredictors; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_intrapreds_[i] == base_intrapreds_[i]) { + cur_intrapreds_[i] = nullptr; + } + } + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + IntraPredictorFunc base_intrapreds_[kNumIntraPredictors]; + IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors]; +}; + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumIntraPredictors], const int num_runs) { + ASSERT_NE(digests, nullptr); + const auto* const left = + reinterpret_cast<const uint8_t*>(intra_pred_mem_.left_mem + 16); + const auto* const top = + reinterpret_cast<const uint8_t*>(intra_pred_mem_.top_mem + 16); + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + for (int i = 0; i < kNumIntraPredictors; ++i) { + if (cur_intrapreds_[i] == nullptr) continue; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), + ToString(static_cast<IntraPredictor>(i)), + digests[i], intra_pred_mem_.dst, + sizeof(intra_pred_mem_.dst), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + Pixel* const left = intra_pred_mem_.left_mem + 16; + Pixel* const top = 
intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + // skip DcFill + for (int i = 1; i < kNumIntraPredictors; ++i) { + if (cur_intrapreds_[i] == nullptr) continue; + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i)) + << " to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + } + } +} + +template <int bitdepth, typename Pixel> +void IntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int i = 0; i < kNumIntraPredictors; ++i) { + // Skip the 'C' test case as this is used as the reference. + if (base_intrapreds_[i] == nullptr) continue; + if (cur_intrapreds_[i] == nullptr) continue; + // It may be worthwhile to temporarily increase this loop size when testing + // changes that specifically affect this test. 
+ for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << ToString(static_cast<IntraPredictor>(i)) + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ + +using IntraPredTest8bpp = IntraPredTest<8, uint8_t>; + +const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50", + "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df", + "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7", + "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec", + "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7", + "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7", + "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0", + "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf", + "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "cb8240be98444ede5ae98ca94afc1557", 
"460acbcf825a1fa0d8f2aa6bf2d6a21c", + "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a", + "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5", + "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d", + "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7", + "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd", + "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4", + "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06", + "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0", + "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0", + "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1", + "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713", + "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194", + "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42", + "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968", + "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd", + "a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82", + "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1", + "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289", + "9525dbfdbf5fba61ef9c7aa5fe887503", 
"8c6a7e3717ff8a459f415c79bb17341c", + "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa", + }; + static const char* const kDigests16x4[kNumIntraPredictors] = { + "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462", + "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34", + "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538", + "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a", + "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5", + "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea", + "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f", + "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e", + "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8", + "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3", + "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634", + "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4", + "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d", + "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb", + "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f", + "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182", + "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "727797ef15ccd8d325476fe8f12006a3", 
"f77c544ac8035e01920deae40cee7b07", + "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151", + "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d", + "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199", + "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52", + "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894", + "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b", + "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1", + "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72", + "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6", + "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d", + "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311", + "0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9", + "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8", + "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202", + "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2", + "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305", + }; + static const char* const kDigests32x64[kNumIntraPredictors] = { + "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d", + "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb", + "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512", + "e720ebccae7e25e36f23da53ae5b5d6a", 
"86fe4364734169aaa4520d799890d530", + "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9", + "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a", + "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3", + "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6", + "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4", + "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb", + "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530", + "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233", + "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e", + "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755", + "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe", + "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27", + "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return 
kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest8bpp, FixedInput) { + TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraPredTest10bpp = IntraPredTest<10, uint16_t>; + +const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705", + "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96", + "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3", + "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25", + "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113", + "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6", + "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847", + "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968", + 
"a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86", + "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3", + "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418", + "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb", + "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2", + "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417", + "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81", + "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa", + "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304", + "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01", + "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f", + "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348", + "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2", + "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec", + "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1", + "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec", + "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e", + 
"8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239", + "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4", + "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b", + "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705", + }; + static const char* const kDigests16x4[kNumIntraPredictors] = { + "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd", + "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b", + "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699", + "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284", + "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76", + "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175", + "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408", + "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34", + "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50", + "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5", + "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4", + "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0", + "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378", + "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5", + "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f", + "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb", + 
"a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604", + "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab", + "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0", + "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94", + "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53", + "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80", + "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65", + "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a", + "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80", + "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd", + "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918", + "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a", + "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322", + "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493", + "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e", + "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998", + "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc", + }; + static const char* const kDigests32x64[kNumIntraPredictors] = { + "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b", + 
"58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751", + "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e", + "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e", + "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722", + "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba", + "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017", + "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217", + "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785", + "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1", + "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a", + "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86", + "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65", + "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a", + "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e", + "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c", + "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case 
kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest10bpp, FixedInput) { + TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + 
+#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc index a03fad2..ed984d8 100644 --- a/src/dsp/inverse_transform.cc +++ b/src/dsp/inverse_transform.cc @@ -1184,9 +1184,10 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size, Residual tx_buffer[64]; for (int j = 0; j < tx_width; ++j) { const int flipped_j = flip_columns ? tx_width - j - 1 : j; - for (int i = 0; i < tx_height; ++i) { + int i = 0; + do { tx_buffer[i] = residual[i][flipped_j]; - } + } while (++i != tx_height); if (adjusted_tx_height == 1) { dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false); } else { @@ -1211,6 +1212,7 @@ void TransformLoop_C(TransformType tx_type, TransformSize tx_size, //------------------------------------------------------------------------------ +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS template <int bitdepth, typename Residual, typename Pixel> void InitAll(Dsp* const dsp) { // Maximum transform size for Dct is 64. 
@@ -1325,6 +1327,7 @@ void InitAll(Dsp* const dsp) { Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>, /*is_row=*/false>; } +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc new file mode 100644 index 0000000..623e203 --- /dev/null +++ b/src/dsp/inverse_transform_test.cc @@ -0,0 +1,536 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/inverse_transform.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kTransformSize1DNames[kNum1DTransformSizes] = { + "k1DTransformSize4", "k1DTransformSize8", "k1DTransformSize16", + "k1DTransformSize32", "k1DTransformSize64"}; + +constexpr TransformSize1D kRow1DTransformSizes[] = { + k1DTransformSize4, k1DTransformSize4, k1DTransformSize4, + k1DTransformSize8, k1DTransformSize8, k1DTransformSize8, + k1DTransformSize8, k1DTransformSize16, k1DTransformSize16, + k1DTransformSize16, k1DTransformSize16, k1DTransformSize16, + k1DTransformSize32, k1DTransformSize32, k1DTransformSize32, + k1DTransformSize32, k1DTransformSize64, k1DTransformSize64, + k1DTransformSize64}; + +constexpr TransformSize1D kCol1DTransformSizes[] = { + k1DTransformSize4, k1DTransformSize8, k1DTransformSize16, + k1DTransformSize4, k1DTransformSize8, k1DTransformSize16, + k1DTransformSize32, k1DTransformSize4, k1DTransformSize8, + k1DTransformSize16, k1DTransformSize32, k1DTransformSize64, + k1DTransformSize8, k1DTransformSize16, k1DTransformSize32, + k1DTransformSize64, k1DTransformSize16, k1DTransformSize32, + k1DTransformSize64}; + +template <int bitdepth, typename SrcPixel, typename DstPixel> +class InverseTransformTestBase : public testing::TestWithParam<TransformSize>, + public 
test_utils::MaxAlignedAllocable { + public: + InverseTransformTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + InverseTransformTestBase(const InverseTransformTestBase&) = delete; + InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete; + ~InverseTransformTestBase() override = default; + + protected: + struct InverseTransformMem { + void Reset(libvpx_test::ACMRandom* rnd, int width, int height) { + ASSERT_NE(rnd, nullptr); + // Limit the size of the residual values to bitdepth + sign in order + // to prevent outranging in the transforms. + const int num_bits = bitdepth + 1; + const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits; + const int mask = (1 << num_bits) - 1; + // Fill residual with random data. For widths == 64, only fill the upper + // left 32 x min(block_height_, 32). + memset(ref_src, 0, sizeof(ref_src)); + SrcPixel* r = ref_src; + const int stride = width; + for (int y = 0; y < std::min(height, 32); ++y) { + for (int x = 0; x < std::min(width, 32); ++x) { + r[x] = rnd->Rand16() & mask; + // The msb of num_bits is the sign bit, so force each 16 bit value to + // the correct sign. + r[x] = (r[x] << sign_shift) >> sign_shift; + } + r += stride; + } + + // Set frame data to random values. + for (int y = 0; y < kMaxBlockSize; ++y) { + for (int x = 0; x < kMaxBlockSize; ++x) { + const int mask = (1 << bitdepth) - 1; + cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] = + rnd->Rand16() & mask; + } + } + } + + // Set ref_src to |pixel|. 
+ void Set(const SrcPixel pixel) { + for (auto& r : ref_src) r = pixel; + } + + alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels]; + alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels]; + alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + InverseTransformMem inverse_transform_mem_; +}; + +//------------------------------------------------------------------------------ +// InverseTransformTest + +template <int bitdepth, typename Pixel, typename DstPixel> +class InverseTransformTest + : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> { + public: + InverseTransformTest() = default; + InverseTransformTest(const InverseTransformTest&) = delete; + InverseTransformTest& operator=(const InverseTransformTest&) = delete; + ~InverseTransformTest() override = default; + + protected: + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::tx_size_; + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_width_; + using InverseTransformTestBase<bitdepth, Pixel, DstPixel>::block_height_; + using InverseTransformTestBase<bitdepth, Pixel, + DstPixel>::inverse_transform_mem_; + + void SetUp() override { + InverseTransformTestBase<bitdepth, Pixel, DstPixel>::SetUp(); + InverseTransformInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + tx_size_1d_row_ = kRow1DTransformSizes[tx_size_]; + tx_size_1d_column_ = kCol1DTransformSizes[tx_size_]; + + memcpy(base_inverse_transforms_, dsp->inverse_transforms, + sizeof(base_inverse_transforms_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if 
(absl::StartsWith(test_case, "C/")) { + memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + InverseTransformInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + InverseTransformInit_NEON(); + InverseTransformInit10bpp_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + memcpy(cur_inverse_transforms_, dsp->inverse_transforms, + sizeof(cur_inverse_transforms_)); + + for (int i = 0; i < kNum1DTransforms; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] == + base_inverse_transforms_[i][tx_size_1d_row_][kRow]) { + cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr; + } + if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] == + base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) { + cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr; + } + } + + base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.base_frame); + + cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.cur_frame); + } + + // These tests modify inverse_transform_mem_. 
+ void TestRandomValues(int num_tests); + void TestDcOnlyRandomValue(int num_tests); + + Array2DView<DstPixel> base_frame_buffer_; + Array2DView<DstPixel> cur_frame_buffer_; + + TransformSize1D tx_size_1d_row_ = k1DTransformSize4; + TransformSize1D tx_size_1d_column_ = k1DTransformSize4; + + InverseTransformAddFuncs base_inverse_transforms_; + InverseTransformAddFuncs cur_inverse_transforms_; +}; + +constexpr TransformType kLibgav1TxType[kNumTransformTypes] = { + kTransformTypeDctDct, kTransformTypeAdstDct, + kTransformTypeDctAdst, kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity}; + +// Maps TransformType to dsp::Transform1D for the row transforms. +constexpr Transform1D kRowTransform[kNumTransformTypes] = { + k1DTransformDct, k1DTransformAdst, k1DTransformDct, + k1DTransformAdst, k1DTransformAdst, k1DTransformDct, + k1DTransformAdst, k1DTransformAdst, k1DTransformAdst, + k1DTransformIdentity, k1DTransformIdentity, k1DTransformDct, + k1DTransformIdentity, k1DTransformAdst, k1DTransformIdentity, + k1DTransformAdst}; + +// Maps TransformType to dsp::Transform1D for the column transforms. +constexpr Transform1D kColumnTransform[kNumTransformTypes] = { + k1DTransformDct, k1DTransformDct, k1DTransformAdst, + k1DTransformAdst, k1DTransformDct, k1DTransformAdst, + k1DTransformAdst, k1DTransformAdst, k1DTransformAdst, + k1DTransformIdentity, k1DTransformDct, k1DTransformIdentity, + k1DTransformAdst, k1DTransformIdentity, k1DTransformAdst, + k1DTransformIdentity}; + +// Mask indicating whether the transform sets contain a particular transform +// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set. 
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = { + BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F), + BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)}; + +bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) { + const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size]; + TransformSet tx_set; + if (tx_size_square_max > kTransformSize32x32) { + tx_set = kTransformSetDctOnly; + } else if (tx_size_square_max == kTransformSize32x32) { + tx_set = kTransformSetInter3; + } else if (tx_size_square_max == kTransformSize16x16) { + tx_set = kTransformSetInter2; + } else { + tx_set = kTransformSetInter1; + } + return kTransformTypeInSetMask[tx_set].Contains(tx_type); +} + +template <int bitdepth, typename Pixel, typename DstPixel> +void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestRandomValues( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = kLibgav1TxType[tx_type_idx]; + const Transform1D row_transform = kRowTransform[tx_type]; + const Transform1D column_transform = kColumnTransform[tx_type]; + + // Skip the 'C' test case as this is used as the reference. + if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + base_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given tx_type. See 5.11.40. 
+ if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + inverse_transform_mem_.Reset(&rnd, block_width_, block_height_); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + const absl::Time base_row_start = absl::Now(); + base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from optimized version 
of " + << ToString( + static_cast<TransformSize1D>(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << "tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_], + base_row_elapsed_time_us, cur_row_elapsed_time_us, + static_cast<float>(base_row_elapsed_time_us) / + static_cast<float>(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast<float>(base_column_elapsed_time_us) / + static_cast<float>(cur_column_elapsed_time_us)); + } + } +} + +template <int bitdepth, typename Pixel, typename DstPixel> +void InverseTransformTest<bitdepth, Pixel, DstPixel>::TestDcOnlyRandomValue( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = kLibgav1TxType[tx_type_idx]; + const Transform1D row_transform = kRowTransform[tx_type]; + const Transform1D column_transform = kColumnTransform[tx_type]; + + if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given 
tx_type. See 5.11.40. + if (IsTxSizeTypeValid(tx_size_, tx_type) == 0) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + // Using width == 1 and height == 1 will reset only the dc value. + inverse_transform_mem_.Reset(&rnd, 1, 1); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + // For this test, the "base" contains the output when the + // tx_height is set to the max for the given block size. The + // "cur" contains the output when the passed in tx_height is 1. + // Compare the outputs for match. + const absl::Time base_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, 
start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from dc only version of " + << ToString( + static_cast<TransformSize1D>(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << "tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_row_], + base_row_elapsed_time_us, cur_row_elapsed_time_us, + static_cast<float>(base_row_elapsed_time_us) / + static_cast<float>(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast<int>( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + ToString(tx_type), kTransformSize1DNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast<float>(base_column_elapsed_time_us) / + static_cast<float>(cur_column_elapsed_time_us)); + } + } +} + +using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>; + +TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); } + +TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +constexpr TransformSize kTransformSizesAll[] = { + kTransformSize4x4, kTransformSize4x8, 
kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>; + +TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); } + +TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp, + testing::ValuesIn(kTransformSizesAll)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize param) { + return os << ToString(param); +} + +} // namespace libgav1 diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake index 960d5a7..a28334d 100644 --- a/src/dsp/libgav1_dsp.cmake +++ b/src/dsp/libgav1_dsp.cmake @@ -40,8 +40,16 @@ list(APPEND libgav1_dsp_sources "${libgav1_source}/dsp/film_grain_common.h" "${libgav1_source}/dsp/intra_edge.cc" "${libgav1_source}/dsp/intra_edge.h" + "${libgav1_source}/dsp/intrapred_cfl.cc" + "${libgav1_source}/dsp/intrapred_cfl.h" + "${libgav1_source}/dsp/intrapred_directional.cc" + 
"${libgav1_source}/dsp/intrapred_directional.h" + "${libgav1_source}/dsp/intrapred_filter.cc" + "${libgav1_source}/dsp/intrapred_filter.h" "${libgav1_source}/dsp/intrapred.cc" "${libgav1_source}/dsp/intrapred.h" + "${libgav1_source}/dsp/intrapred_smooth.cc" + "${libgav1_source}/dsp/intrapred_smooth.h" "${libgav1_source}/dsp/inverse_transform.cc" "${libgav1_source}/dsp/inverse_transform.h" "${libgav1_source}/dsp/inverse_transform.inc" @@ -67,6 +75,8 @@ list(APPEND libgav1_dsp_sources list(APPEND libgav1_dsp_sources_avx2 ${libgav1_dsp_sources_avx2} + "${libgav1_source}/dsp/x86/cdef_avx2.cc" + "${libgav1_source}/dsp/x86/cdef_avx2.h" "${libgav1_source}/dsp/x86/convolve_avx2.cc" "${libgav1_source}/dsp/x86/convolve_avx2.h" "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc" @@ -89,11 +99,16 @@ list(APPEND libgav1_dsp_sources_neon "${libgav1_source}/dsp/arm/intra_edge_neon.cc" "${libgav1_source}/dsp/arm/intra_edge_neon.h" "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h" + "${libgav1_source}/dsp/arm/intrapred_directional_neon.h" "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc" - "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.h" "${libgav1_source}/dsp/arm/intrapred_neon.cc" "${libgav1_source}/dsp/arm/intrapred_neon.h" "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h" + "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.h" "${libgav1_source}/dsp/arm/loop_filter_neon.cc" @@ -124,14 +139,23 @@ list(APPEND libgav1_dsp_sources_sse4 "${libgav1_source}/dsp/x86/cdef_sse4.h" "${libgav1_source}/dsp/x86/convolve_sse4.cc" "${libgav1_source}/dsp/x86/convolve_sse4.h" + "${libgav1_source}/dsp/x86/convolve_sse4.inc" 
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc" "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h" + "${libgav1_source}/dsp/x86/film_grain_sse4.cc" + "${libgav1_source}/dsp/x86/film_grain_sse4.h" "${libgav1_source}/dsp/x86/intra_edge_sse4.cc" "${libgav1_source}/dsp/x86/intra_edge_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h" "${libgav1_source}/dsp/x86/intrapred_sse4.cc" "${libgav1_source}/dsp/x86/intrapred_sse4.h" - "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h" "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc" "${libgav1_source}/dsp/x86/inverse_transform_sse4.h" "${libgav1_source}/dsp/x86/loop_filter_sse4.cc" diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc new file mode 100644 index 0000000..ca5107a --- /dev/null +++ b/src/dsp/loop_filter_test.cc @@ -0,0 +1,348 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/loop_filter.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <ostream> +#include <string> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/third_party/libvpx/md5_helper.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Horizontal and Vertical need 32x32: 8 pixels preceding filtered section +// 16 pixels within filtered section +// 8 pixels following filtered section +constexpr int kNumPixels = 1024; +constexpr int kBlockStride = 32; + +constexpr int kNumTests = 50000; +constexpr int kNumSpeedTests = 500000; + +constexpr int kMaxLoopFilter = 63; + +template <typename Pixel> +void InitInput(Pixel* dst, const int stride, const int bitdepth, + libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh, + const bool transpose) { + const int max_pixel = (1 << bitdepth) - 1; + const int pixel_range = max_pixel + 1; + Pixel tmp[kNumPixels]; + auto clip_pixel = [max_pixel](int val) { + return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0)); + }; + + for (int i = 0; i < kNumPixels;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { // 50% chance to choose a new value. + tmp[i++] = rnd(pixel_range); + } else { // 50% chance to repeat previous value in row X times. + int j = 0; + while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) { + if (i < 1) { + tmp[i] = rnd(pixel_range); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1)); + } else { // Decrement by a value within the limit. 
+ tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1)); + } + ++i; + } + } + } + + for (int i = 0; i < kNumPixels;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { + ++i; + } else { // 50% chance to repeat previous value in column X times. + int j = 0; + while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) { + if (i < 1) { + tmp[i] = rnd(pixel_range); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp[(i % 32) * 32 + i / 32] = clip_pixel( + tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1)); + } else { // Decrement by a value within the inner_thresh. + tmp[(i % 32) * 32 + i / 32] = clip_pixel( + tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1)); + } + ++i; + } + } + } + + for (int i = 0; i < kNumPixels; ++i) { + const int offset = transpose ? stride * (i % stride) + i / stride : i; + dst[i] = tmp[offset]; + } +} + +template <int bitdepth, typename Pixel> +class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> { + public: + LoopFilterTest() = default; + LoopFilterTest(const LoopFilterTest&) = delete; + LoopFilterTest& operator=(const LoopFilterTest&) = delete; + ~LoopFilterTest() override = default; + + protected: + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopFilterInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memcpy(base_loop_filters_, dsp->loop_filters[size_], + sizeof(base_loop_filters_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + memset(base_loop_filters_, 0, sizeof(base_loop_filters_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopFilterInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopFilterInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " 
+ << test_case; + } + + memcpy(cur_loop_filters_, dsp->loop_filters[size_], + sizeof(cur_loop_filters_)); + + for (int i = 0; i < kNumLoopFilterTypes; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_loop_filters_[i] == base_loop_filters_[i]) { + cur_loop_filters_[i] = nullptr; + } + } + } + + // Check |digests| if non-NULL otherwise print the filter timing. + void TestRandomValues(const char* const digests[kNumLoopFilterTypes], + int num_runs) const; + void TestSaturatedValues() const; + + const LoopFilterSize size_ = GetParam(); + LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes]; + LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes]; +}; + +template <int bitdepth, typename Pixel> +void LoopFilterTest<bitdepth, Pixel>::TestRandomValues( + const char* const digests[kNumLoopFilterTypes], const int num_runs) const { + for (int i = 0; i < kNumLoopFilterTypes; ++i) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + if (cur_loop_filters_[i] == nullptr) continue; + + libvpx_test::MD5 md5_digest; + absl::Duration elapsed_time; + for (int n = 0; n < num_runs; ++n) { + Pixel dst[kNumPixels]; + const auto outer_thresh = + static_cast<uint8_t>(rnd(3 * kMaxLoopFilter + 5)); + const auto inner_thresh = static_cast<uint8_t>(rnd(kMaxLoopFilter + 1)); + const auto hev_thresh = + static_cast<uint8_t>(rnd(kMaxLoopFilter + 1) >> 4); + InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0); + + const absl::Time start = absl::Now(); + cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, + outer_thresh, inner_thresh, hev_thresh); + elapsed_time += absl::Now() - start; + + md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst)); + } + if (digests == nullptr) { + const auto elapsed_time_us = + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)); + printf("Mode %s[%25s]: %5d us\n", + ToString(static_cast<LoopFilterSize>(size_)), + 
ToString(static_cast<LoopFilterType>(i)), elapsed_time_us); + } else { + const std::string digest = md5_digest.Get(); + printf("Mode %s[%25s]: MD5: %s\n", + ToString(static_cast<LoopFilterSize>(size_)), + ToString(static_cast<LoopFilterType>(i)), digest.c_str()); + EXPECT_STREQ(digests[i], digest.c_str()); + } + } +} + +template <int bitdepth, typename Pixel> +void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const { + const LoopFilterType filter = kLoopFilterTypeHorizontal; + if (cur_loop_filters_[filter] == nullptr) return; + + Pixel dst[kNumPixels], ref[kNumPixels]; + const auto value = static_cast<Pixel>((1 << bitdepth) - 1); + for (auto& r : dst) r = value; + memcpy(ref, dst, sizeof(dst)); + + const int outer_thresh = 24; + const int inner_thresh = 8; + const int hev_thresh = 0; + cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride, + outer_thresh, inner_thresh, hev_thresh); + ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride, + kBlockStride, kBlockStride, true)) + << "kLoopFilterTypeHorizontal output doesn't match reference"; +} + +//------------------------------------------------------------------------------ + +using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>; + +const char* const* GetDigests8bpp(LoopFilterSize size) { + static const char* const kDigestsSize4[kNumLoopFilterTypes] = { + "2e07bdb04b363d4ce69c7d738b1ee01a", + "7ff41f2ffa809a2016d342d92afa7f89", + }; + static const char* const kDigestsSize6[kNumLoopFilterTypes] = { + "2cd4d9ee7497ed67e38fad9cbeb7e278", + "75c57a30a927d1aca1ac5c4f175712ca", + }; + static const char* const kDigestsSize8[kNumLoopFilterTypes] = { + "854860a272d58ace223454ea727a6fe4", + "4129ee49b047777583c0e9b2006c87bf", + }; + static const char* const kDigestsSize14[kNumLoopFilterTypes] = { + "6eb768620b7ccc84b6f88b9193b02ad2", + "56e034d9edbe0d5a3cae69b2d9b3486e", + }; + + switch (size) { + case kLoopFilterSize4: + return kDigestsSize4; + case kLoopFilterSize6: + return 
kDigestsSize6; + case kLoopFilterSize8: + return kDigestsSize8; + case kLoopFilterSize14: + return kDigestsSize14; + default: + ADD_FAILURE() << "Unknown loop filter size" << size; + return nullptr; + } +} + +TEST_P(LoopFilterTest8bpp, DISABLED_Speed) { + TestRandomValues(nullptr, kNumSpeedTests); +} + +TEST_P(LoopFilterTest8bpp, FixedInput) { + TestRandomValues(GetDigests8bpp(size_), kNumTests); +} + +TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); } + +constexpr LoopFilterSize kLoopFilterSizes[] = { + kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14}; + +INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>; + +const char* const* GetDigests10bpp(LoopFilterSize size) { + static const char* const kDigestsSize4[kNumLoopFilterTypes] = { + "657dd0f612734c9c1fb50a2313567af4", + "b1c0a0a0b35bad1589badf3c291c0461", + }; + static const char* const kDigestsSize6[kNumLoopFilterTypes] = { + "d41906d4830157052d5bde417d9df9fc", + "451490def78bd649d16d64db4e665a62", + }; + static const char* const kDigestsSize8[kNumLoopFilterTypes] = { + "a763127680f31db7184f2a63ee140268", + "1f413bebacaa2435f0e07963a9095243", + }; + static const char* const kDigestsSize14[kNumLoopFilterTypes] = { + "f0e61add3e5856657c4055751a6dd6e2", + "44da25d613ea601bf5f6e2a42d329cf0", + }; + + switch (size) { + case kLoopFilterSize4: + return kDigestsSize4; + case kLoopFilterSize6: + return kDigestsSize6; + case kLoopFilterSize8: + return kDigestsSize8; + case kLoopFilterSize14: + return kDigestsSize14; + 
default: + ADD_FAILURE() << "Unknown loop filter size" << size; + return nullptr; + } +} + +TEST_P(LoopFilterTest10bpp, DISABLED_Speed) { + TestRandomValues(nullptr, kNumSpeedTests); +} + +TEST_P(LoopFilterTest10bpp, FixedInput) { + TestRandomValues(GetDigests10bpp(size_), kNumTests); +} + +INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp, + testing::ValuesIn(kLoopFilterSizes)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif +#endif + +} // namespace + +static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) { + return os << ToString(size); +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc index 0909df0..1a15d90 100644 --- a/src/dsp/loop_restoration.cc +++ b/src/dsp/loop_restoration.cc @@ -143,12 +143,12 @@ inline void WienerVertical(const int16_t* wiener_buffer, const int width, // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]). // Thus in libaom's computation, an offset of 128 is needed for filter[3]. 
template <int bitdepth, typename Pixel> -void WienerFilter_C(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_C( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { constexpr int kCenterTap = kWienerFilterTaps / 2; const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; @@ -170,38 +170,42 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info, auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width; if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 0, - &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 0, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 0, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 0, &wiener_buffer); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 1, + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 0, &wiener_buffer); + } else if 
(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 1, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 1, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 1, &wiener_buffer); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 2, + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 1, &wiener_buffer); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 2, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 2, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 2, &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 2, + &wiener_buffer); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontal<bitdepth, Pixel>(top + (2 - height_extra) * stride, stride, - width, height_extra, filter_horizontal, 3, - &wiener_buffer); + WienerHorizontal<bitdepth, Pixel>( + top + (2 - height_extra) * top_border_stride, top_border_stride, width, + height_extra, filter_horizontal, 3, &wiener_buffer); WienerHorizontal<bitdepth, Pixel>(src, stride, width, height, filter_horizontal, 3, &wiener_buffer); - WienerHorizontal<bitdepth, Pixel>(bottom, stride, width, height_extra, - filter_horizontal, 3, &wiener_buffer); + 
WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width, + height_extra, filter_horizontal, 3, + &wiener_buffer); } // vertical filtering. @@ -233,7 +237,7 @@ void WienerFilter_C(const RestorationUnitInfo& restoration_info, //------------------------------------------------------------------------------ // SGR -// When |height| is 1, |src_stride| could be set to arbitrary value. +// When |height| is 1, |src_stride| could be set to an arbitrary value. template <typename Pixel, int size> LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, const int height, const int width, @@ -267,7 +271,7 @@ LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, } while (--y != 0); } -// When |height| is 1, |src_stride| could be set to arbitrary value. +// When |height| is 1, |src_stride| could be set to an arbitrary value. template <typename Pixel> LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride, const int height, const int width, @@ -541,8 +545,11 @@ inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride, template <int bitdepth, typename Pixel> inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, - const Pixel* src, const Pixel* const top_border, - const Pixel* bottom_border, const ptrdiff_t stride, + const Pixel* src, const ptrdiff_t stride, + const Pixel* const top_border, + const ptrdiff_t top_border_stride, + const Pixel* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, Pixel* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 8); @@ -582,8 +589,8 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum<Pixel>(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3, - square_sum5 + 1); + BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 
1, + square_sum3, square_sum5 + 1); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, @@ -631,7 +638,7 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, ptrdiff_t s_stride; if ((height & 1) == 0) { sr = bottom_border; - s_stride = stride; + s_stride = bottom_border_stride; } else { sr = src + 2 * stride; s_stride = bottom_border - (src + 2 * stride); @@ -658,8 +665,9 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxSum<Pixel>(bottom_border + stride, stride, 1, width + 2, sum3 + 2, - sum5 + 3, square_sum3 + 2, square_sum5 + 3); + BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1, + width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, + square_sum5 + 3); sum5[4] = sum5[3]; square_sum5[4] = square_sum5[3]; BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0], @@ -681,12 +689,13 @@ inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, template <int bitdepth, typename Pixel> inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const Pixel* src, + const Pixel* src, const ptrdiff_t stride, const Pixel* const top_border, + const ptrdiff_t top_border_stride, const Pixel* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - Pixel* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 8); const ptrdiff_t sum_stride = temp_stride + 8; const int sgr_proj_index = restoration_info.sgr_proj_info.index; @@ -705,7 +714,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<Pixel, 5>(top_border, stride, 2, 
width + 2, sum5 + 1, square_sum5 + 1); + BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1, + square_sum5 + 1); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3); @@ -736,7 +746,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, ptrdiff_t s_stride; if ((height & 1) == 0) { sr = bottom_border; - s_stride = stride; + s_stride = bottom_border_stride; } else { sr = src + 2 * stride; s_stride = bottom_border - (src + 2 * stride); @@ -755,8 +765,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxSum<Pixel, 5>(bottom_border + stride, stride, 1, width + 2, sum5 + 3, - square_sum5 + 3); + BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride, + 1, width + 2, sum5 + 3, square_sum5 + 3); sum5[4] = sum5[3]; square_sum5[4] = square_sum5[3]; BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer, @@ -772,12 +782,13 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, template <int bitdepth, typename Pixel> inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const Pixel* src, + const Pixel* src, const ptrdiff_t stride, const Pixel* const top_border, + const ptrdiff_t top_border_stride, const Pixel* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - Pixel* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 8); const ptrdiff_t sum_stride = temp_stride + 8; @@ -802,7 +813,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = 
b444[0] + temp_stride; assert(scale != 0); - BoxSum<Pixel, 3>(top_border, stride, 2, width + 2, sum3, square_sum3); + BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3, + square_sum3); BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2); BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false, sgr_buffer, ma343[0], b343[0], nullptr, @@ -814,7 +826,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2); BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true, @@ -845,7 +857,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -854,12 +866,12 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } template <int bitdepth, typename Pixel> -void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void SelfGuidedFilter_C( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 const int radius_pass_1 
= kSgrProjParams[index][2]; // 1 or 0 @@ -872,17 +884,17 @@ void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info, // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3, - bottom - 3, stride, width, height, - sgr_buffer, dst); + BoxFilterProcessPass1<bitdepth, Pixel>( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2, - bottom - 2, stride, width, height, - sgr_buffer, dst); + BoxFilterProcessPass2<bitdepth, Pixel>( + restoration_info, src - 2, stride, top - 2, top_border_stride, + bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst); } else { - BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3, - bottom - 3, stride, width, height, - sgr_buffer, dst); + BoxFilterProcess<bitdepth, Pixel>( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); } } diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc new file mode 100644 index 0000000..97a05d4 --- /dev/null +++ b/src/dsp/loop_restoration_test.cc @@ -0,0 +1,616 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <string> + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/common.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// in unit of Pixel. +constexpr int kBorder = 16; +constexpr int kWidth = 256; +constexpr int kHeight = 255; +constexpr int kStride = kWidth + 2 * kBorder; +constexpr int kOffset = kBorder * kStride + kBorder; +constexpr int kMaxBlockSize = 288 * kStride; +constexpr int kUnitWidths[] = {32, 64, 128, 256}; + +constexpr int kNumRadiusTypes = 3; +constexpr int kNumWienerOrders = 4; +constexpr int kWienerOrders[] = {7, 5, 3, 1}; +constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0}; + +template <int bitdepth, typename Pixel> +class SelfGuidedFilterTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + SelfGuidedFilterTest() = default; + SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete; + SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete; + ~SelfGuidedFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if 
LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + target_self_guided_filter_func_ = dsp->loop_restorations[1]; + restoration_info_.type = kLoopRestorationTypeSgrProj; + memset(dst_, 0, sizeof(dst_)); + } + + void SetInputData(int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* rnd); + void TestFixedValues(int test_index, Pixel value); + void TestRandomValues(bool speed); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc target_self_guided_filter_func_; +}; + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData( + int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* const rnd) { + const int mask = (1 << bitdepth) - 1; + if (type == 0) { // Set fixed values + for (auto& s : src_) s = value; + } else { // Set random values + for (auto& s : src_) s = rnd->Rand16() & mask; + } + for (auto& d : dst_) d = rnd->Rand16() & mask; + restoration_info_.sgr_proj_info.multiplier[0] = + kSgrProjMultiplierMin[0] + + rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] + + 1); + restoration_info_.sgr_proj_info.multiplier[1] = + kSgrProjMultiplierMin[1] + + rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] + + 1); + // 
regulate multiplier so that it matches libaom. + // Valid self-guided filter doesn't allow r0 and r1 to be 0 at the same time. + // When r0 or r1 is zero, its corresponding multiplier is set to zero in + // libaom. + int index; + if (radius_index == 0) { + index = 0; // r0 = 2, r1 = 1 + } else if (radius_index == 1) { + index = 10; // r0 = 0, r1 = 1 + } else /* if (radius_index == 2) */ { + index = 14; // r0 = 2, r1 = 0 + } + const uint8_t r0 = kSgrProjParams[index][0]; + const uint8_t r1 = kSgrProjParams[index][2]; + static constexpr int kMultiplier[2] = {0, 95}; + restoration_info_.sgr_proj_info.index = index; + if (r0 == 0) { + restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0]; + } else if (r1 == 0) { + restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1]; + } +} + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index, + Pixel value) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0", + "a03314fc210bee68c7adbb44d2bbdac7"}, + {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54", + "a6583fe9359877f4a259c81d900fc4fb"}}, + {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a", + "27100f37b3e42a5f2a051e1566edb6f8"}, + {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3", + "69c274ac59c99999e1bfbf2fc4586ebd"}}, + {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4", + "92f31086ba2f9e1508983b22d93a4e5c"}, + {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13", + "43dd7df2c2a601262c68cd8af1c61b82"}}, + {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4", + "f8a6a025827f29f857bed3e28ba3ea33"}, + {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42", + "20dcbe8e317a4373bebf11d56adc5f02"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 
<< bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(0, value, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb", + "ca67159cd29475ac5d52ca4a0df3ea10"}, + {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678", + "a8ba988283d9e1ad1f0dcdbf6bbdaade"}}, + {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9", + "a4005899fa8d3c3c4669910f93ff1290"}, + {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e", + "07203ad761775d5d317f2b7884afd9fe"}}, + {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084", + "475bcb6a58f87da7723f6227bc2aca0e"}, + {"4d589683f69ccc5b416149dcc5c835d5", 
"986b6832df1f6020d50be61ae121e42f", + "7cb5c5dbdb3d1c54cfa00def450842dc"}}, + {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9", + "f1eda6d15b37172199d9949c2315832f"}, + {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8", + "b23dc0b54c3500248d53377030428a61"}}, + {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36", + "23966cba3e0e7803eeb951905861e0dd"}, + {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de", + "dcee48f94126a2132963e86e93dd4903"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + const int num_inputs = speed ? 1 : 5; + const int num_tests = speed ? 20000 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int i = 0; i < num_inputs; ++i) { + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(1, 0, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } + } +} + +using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>; + 
+TEST_P(SelfGuidedFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>; + +TEST_P(SelfGuidedFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template <int bitdepth, typename Pixel> +class WienerFilterTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + WienerFilterTest() = default; + WienerFilterTest(const WienerFilterTest&) = delete; + WienerFilterTest& operator=(const WienerFilterTest&) = delete; + ~WienerFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const Dsp* const dsp = GetDspTable(bitdepth); + 
ASSERT_NE(dsp, nullptr); + base_wiener_filter_func_ = dsp->loop_restorations[0]; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + target_wiener_filter_func_ = dsp->loop_restorations[0]; + restoration_info_.type = kLoopRestorationTypeWiener; + memset(dst_, 0, sizeof(dst_)); + memset(tmp_, 0, sizeof(tmp_)); + memset(buffer_, 0, sizeof(buffer_)); + } + + static void CleanFilterByOrder(const int order, + int16_t filter[kWienerFilterTaps]) { + if (order <= 5) filter[0] = 0; + if (order <= 3) filter[1] = 0; + if (order <= 1) filter[2] = 0; + } + + void SetInputData(int type, Pixel value, int vertical_order, + int horizontal_order); + void TestFixedValues(int digest_id, Pixel value); + void TestRandomValues(bool speed); + void TestCompare2C(); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) + uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) * + kRestorationUnitHeight]; + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc 
base_wiener_filter_func_; + LoopRestorationFunc target_wiener_filter_func_; +}; + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::SetInputData( + int type, Pixel value, const int vertical_order, + const int horizontal_order) { + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + if (type == 0) { + for (auto& s : src_) s = value; + } else { + for (auto& s : src_) s = rnd.Rand16() & mask; + } + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + filter[3] = 128; + for (int j = 0; j < 3; ++j) { + filter[j] = kWienerTapsMin[j] + + rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1); + } + CleanFilterByOrder(order, filter); + filter[3] -= 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id, + Pixel value) { + static const char* const kDigest[2][4] = { + {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d", + "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"}, + {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916", + "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}}; + if (target_wiener_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 << bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(0, value, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time); + } + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { + static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = { + {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b", + "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"}, + {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9", + "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"}, + {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59", + "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"}, + {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c", + "52a673b6d14fbdca5ebdb1a34ee3326f", + "ebb42c7c9111f2e39f21e2158e801d9e"}}, + {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196", + "375b43abc1d6682d62f91c1841b8b0fc", 
"71e2444822ae9c697ddfc96e07c6e8a1"}, + {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c", + "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"}, + {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068", + "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"}, + {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf", + "7cbc1562a9dd08e1973b3b9ac1afc765", + "3c91bf1a34672cd40bf261c5820d3ec3"}}}; + if (target_wiener_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + const int num_tests = speed ? 100000 : 1; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_tests; ++i) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][kWienerOrderIdLookup[vertical_order]] + [kWienerOrderIdLookup[horizontal_order]], + dst_, sizeof(dst_), elapsed_time); + } + } +} + +template <int bitdepth, typename Pixel> +void WienerFilterTest<bitdepth, Pixel>::TestCompare2C() { + if (base_wiener_filter_func_ == nullptr) return; + if 
(target_wiener_filter_func_ == nullptr) return; + if (base_wiener_filter_func_ == target_wiener_filter_func_) return; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + Pixel* const tmp = tmp_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + for (int x = 0; x < 2; ++x) { + // Prepare min/max filter coefficients. + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + for (int j = 0; j < 3; ++j) { + filter[j] = (x == 0) ? kWienerTapsMin[j] : kWienerTapsMax[j]; + } + CleanFilterByOrder(order, filter); + filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } + base_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, kStride, + unit_width_, unit_height_, + &restoration_buffer_, dst); + target_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, + kStride, unit_width_, unit_height_, + &restoration_buffer_, tmp); + if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_, + kStride, kStride, false, false)) { + ADD_FAILURE() << "Mismatch -- wiener taps min/max"; + } + } + } + } +} + +using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>; + +TEST_P(WienerFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); } + 
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>; + +TEST_P(WienerFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); } + +INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc index 101c410..15ef821 100644 --- a/src/dsp/mask_blend.cc +++ b/src/dsp/mask_blend.cc @@ -25,8 +25,8 @@ namespace libgav1 { namespace dsp { namespace { -template <int subsampling_x, int subsampling_y> -uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) { +uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x, + int subsampling_x, int subsampling_y) { if ((subsampling_x | subsampling_y) == 0) { return mask[x]; } @@ -63,7 +63,7 @@ void MaskBlend_C(const void* prediction_0, const void* prediction_1, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; 
++x) { const uint8_t mask_value = - GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x); + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); if (is_inter_intra) { dst[x] = static_cast<Pixel>(RightShiftWithRounding( mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6)); @@ -96,7 +96,7 @@ void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { const uint8_t mask_value = - GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x); + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding( mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x], 6)); @@ -148,6 +148,7 @@ void Init8bpp() { #ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>; #endif + static_cast<void>(GetMaskValue); #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc new file mode 100644 index 0000000..b5e7e60 --- /dev/null +++ b/src/dsp/mask_blend_test.cc @@ -0,0 +1,493 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/mask_blend.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +// mask_blend is applied to compound prediction values when is_inter_intra is +// false. This implies a range far exceeding that of pixel values. The ranges +// include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58", + "c490478208374a43765900ef7115c264", "b98f222eb70ef8589da2d6c839ca22b8", + "54752ca05f67b5af571bc311aa4e3de3", "344b2dab7accd8bd0a255bee16207336", + "0b2f6f755d1547eea7e0172f8133ea01", "310dc6364fdacba186c01f0e8ac4fcb7", + "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827", + "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7", + "4346c2860b23f0072b6b288f14c1df36", "8f8dd3eeed74ef115ca8a2f82ebff0ba", + "42e8872a81647767636f4c75609e0e2f", "1ff2526547d59557f7bb458249e34527", + "cd303d685268aebd2919dd468928d0ba", "254fb3ad990f9d408d252c70dd682e27", + "ba8d99c62853d14855f5d93e9574c97b", "e8ab744348681d6aa1043080efa86fc9", + "2fa919ca1f54b4336de878ff4015c352", "18e47c9809b909c2bfad08e00dffc635", + 
"9a90c843f06f0b662c509c26f5dd5054", "f89c608f884f37b064fc2b49eb2690a9", + "2448734d948ca6ddeb0ce8038a4ab2cf", "a3e0f86b7a5cb49716a424709c00b5a4", + "eb84dba768b54da10cded2f932f0aab7", "d6e8fdeb6875b70488f25d7f7ed9423f", + "1ca0822febce19c02ddc42a7b3331257", "a9259bb9b87ad002619eb47b907d7226", + "6408c5f327f1a9a390fb0046d4bc112b", "dba612489f87d00a82f2735fbcb98dcc", + "e8626a97699fbd247d6358ad5f766bee", "5e638a6897d7a2950f3512f871fa19e6", + "45a58708939779413f8e0e1de2ee5e6f", "079ae4682d398f0a7e4b66059589586d", + "6a06e617308409f9181b59bdd4f63d83", "b05ade2c1a572fc5fcca92b4163d9afb", + "30e955c3f86111207d5922575602e90a", "af5e6c65ed48a0eb7d509f7036398728", + "f9da3310d7dc75910483dfdd2af6ee62", "a9423b4d67bee5e7c7bc3baa7a9c017a", + "6b90a04333407013dd011c1af582e79f", "e658088a74bfb7cc57a2faa74a6f8689", + "6eedf27126eba6915035f9f701a1b992", "89116a7c6ad3f70a5b3f3105d04ad1a8", + "f41e5e166b049d0006d8b2cab56523b3", "3bed57a684075bbe3c25fd0c3e5520c3", + "85c0b21af2afb18ce948abfe3e23c85b", "bd8aaa3602d6b42438f8449f8adb52cb", + "1266bad904caad2c6d4047abefc2393d", "6573f2fe2a14c9ab7d5e192742388489", + "6b9b443f6306059fa3fe18df9de6dc48", "c9a91ee6ae8b653f552866e4073dd097", + "fa58938384198f7709d4871d155ba100", "033d121fc782e83ff94c31e73407d2a8", + "7ea268d79f7b8c75a4feeb24e892471a", "73a376bb3e07172d1e094ab8e01a7d42", + "13c366e0da1663fac126ea3d3876c110", "2f5eb5fcdf953c63fee2b8c75a6e5568", + "2054b197f002223f2d75699884279511", "67ce53e6991657a922d77cc8a23f1e07", + "f48e6d666435e7a917d6f90539b0d557", "21d03669d8d255e43552f8fb90724717", + "43dbaa1a7aaf2a01764e78e041b6763b", "a8173347ea861ecee6da54f81df73951", + "6b97ec4e4647a8de026d693059b855b7", "a85bf4c4b48791ac4971339877e4bc8a", + "04cf84d020a60ce3ce53845255ca8ec9", "ddd87035b960499b883d0aefcf96b6b2", + "278c5dd102474d598bf788cd66977ba9", "78b3790785811516142d417a49177c8c", + "7883ea9c2df0b4f5797cba31f4352678", "727004811025ac97b04940e2eaf68f94", + "7ffa3f97ec13dc8b6225550133a392bc", "6f5f2cb7a44aa0daea5c6b3315110591", + 
"88a59d68875fb44ec3be9d3fa293bccb", "0516e71f76b9d998794d3d63e480fa2f", + "193793d42f0964b4a958a68d9d7eb4ba", "4d259c7c6a95744e4ebaaa5361befb11", + "c090155b997dc103203bcb5a9dcc6282", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121", + "53a3a76bf2bcd5761cd15fc739a4f4e1", "7597f69dc19a584280be0d67911db6a6", + "e1221c172843dc6c1b345bcd370771cc", "2ccbe012ca167114b14c3ba70befa960", + "0f68632d7e5faddb4554ca430d1df822", "8caa0061a26e142b783951d5abd7bf5d", + "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882", + "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952", + "346a797b7cb4de10759e329f8b49e077", "8f4aa102e9b1ac430bdb9ebd4ec4cfca", + "5886397456b15e504ad55d8e0ce71e0e", "2a78b52ce43dc28606e83521963c00fa", + "8d3ef5280063337b0df97f91251bb8fc", "81f0ceada000ce40586be828a2045430", + "edb7b70a473392148bc419a44385326b", "97abe2eecaf9158a0529b234a241a57a", + "65729d750aa1258e4a7eccef247ac8c2", "78cc995e81188b9e8b29fa58796a3313", + "a1eb6a8c2f7c77e30e739a1b3b07cc74", "805b0f2f4b9d80f118d800b5ab4f603e", + "12610c83533f7170149390ba581f70b2", "cba20deed43b49ada3f626c91510995d", + "ba7ea35410b746fcbcf56c24ccb56d59", "933b2235b9b943984607d87f0bce1067", + "7ae59015295db8983bc8472429076464", "c18cce63327b367c0a260e9cbf4222b9", + "7c9672a7dfa964cb3ed3f2b4b443d2b6", "b29bcf1cc5369702e0179db1198db531", + "412326aff6c89116240b5d3ef63fa5cc", "3d854589fd171e42d118be4627ec5330", + "9a157e51e39ed314031224f074193791", "c645cdc63d3112f27b90cc9080c6d071", + "3f360cc336a4ee9a9bd78bde1a6e9eb3", "37b40fa8674d03a7cd66afdee939b9bf", + "cd6c7b98fe71b533c6a06d6d9122a6d0", "c26e0a0e90a969d762edcab770bed3b7", + "e517967d2cf4f1b0fff09d334475e2ae", "bc760a328a0a4b2d75593667adfa2a0e", + "b6239fdeeccc462640047cb2e2c2be96", "bc01f6a232ef9f0d9e57301779edd67f", + "cf6e8c1823c5498fa5589db40406a6ad", 
"2a9a4bd0bd84f0b85225a5b30f5eaa16", + "56f7bb2265dbd8a563bb269aa527c8a3", "fcbed0f0350be5a1384f95f8090d262e", + "f3ecf2e5747ebff65ac78ecbe7cc5e6a", "1d57d1371ad2f5f320cc4de789665f7c", + "e9f400fee64673b0f6313400fe449135", "5dfdc4a8376740011c777df46418b5d2", + "a4eb2c077300c0d8eeda028c9db3a63a", "90551259280c2b2150f018304204f072", + "4cbcd76496fc5b841cd164b6067b9c0b", "895964acc7b7e7d084de2266421c351b", + "af2e05159d369d0e3b72707f242b2845", "c7d393cef751950df3b9ed8056a9ffce", + "788541c0807aed47b863d47e5912555d", "163a06512f48c1b0f2535c8c50815bcc", + "dc5e723bab9fbfd7074a62e05b6b3c2b", "bf91200ce1bf97b4642a601adc13d700", + "d93fcefa6b9004baaab76d436e7ac931", "e89a2111caecc6bcf5f2b42ea0167ab4", + "e04a058df9b87878ca97edc1c42e76e1", "5d1f60876147edd6ed29d1fb50172464", + "655fb228aa410fd244c58c87fe510bec", "639a8a0a8f62d628136f5a97b3728b69", + "5b60f2428b092a502d6471fa09befd7f", "40601555ac945b4d37d3434b6e5619be", + "02be23bf1f89d5f5af02a39b98f96142", "9347a45bd54d28d8105f8183996b3505", + "d8429cc7b0b388981861a0fdd40289f0", "c4b7fab3b044486f663e160c07805e0a", + "f5f5d513b1f1c13d0abc70fc18afea48", "f236795ea30f1b8761b268734a245ba1", + "c7b7452ea8247a3a40248278d08953d5", "ddd6ba3c5ec56cc7a0b0161ae67001fa", + "94675749f2db46a8ade6f2f211db9a32", "3d165364ff96a5ef39e67a53fe3ed3be", + "3d1d66a9401fd7e78050724ca1fa0419", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct MaskBlendTestParam { + MaskBlendTestParam(int width, int height, int subsampling_x, + int subsampling_y, bool is_inter_intra, + bool is_wedge_inter_intra) + : width(width), + height(height), + subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + is_inter_intra(is_inter_intra), + is_wedge_inter_intra(is_wedge_inter_intra) {} + int width; + int height; + int subsampling_x; + int subsampling_y; + bool is_inter_intra; + bool is_wedge_inter_intra; +}; + +std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) { + return os << "BlockSize" << param.width << 
"x" << param.height + << ", subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y + << ", is_inter_intra: " << param.is_inter_intra + << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra; +} + +template <int bitdepth, typename Pixel> +class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>, + public test_utils::MaxAlignedAllocable { + public: + MaskBlendTest() = default; + ~MaskBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + MaskBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MaskBlendInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MaskBlendInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra) + ? dsp->mask_blend[0][param_.is_inter_intra] + : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y] + [param_.is_inter_intra]; + func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra + ? param_.subsampling_x + + param_.subsampling_y + : 0]; + } + + protected: + int GetDigestIdOffset() const { + // id is for retrieving the corresponding digest from the lookup table given + // the set of input parameters. id can be figured out by its width, height + // and an offset (id_offset). + // For example, in kMaskBlendTestParam, this set of parameters + // (8, 8, 0, 0, false, false) corresponds to the first entry in the + // digest lookup table, where id == 0. + // (8, 8, 1, 0, false, false) corresponds to id == 13. 
+ // (8, 8, 1, 1, false, false) corresponds to id == 26. + // (8, 8, 0, 0, true, false) corresponds to id == 39. + // Id_offset denotes offset for different modes (is_inter_intra, + // is_wedge_inter_intra). Width and height help to figure out id: + // width = 8, height = 8, id = id_offset + log2(8) - 3. + // width = 8, height = 16, id = id_offset + log2(min(width, height) - 3 + 1. + // ... + if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return param_.subsampling_x * 13 + param_.subsampling_y * 13; + } + if (param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return 39 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + if (param_.is_inter_intra && param_.is_wedge_inter_intra) { + return 60 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + return 0; + } + + int GetDigestId() const { + int id = GetDigestIdOffset(); + if (param_.width == param_.height) { + return id + 3 * (FloorLog2(param_.width) - 3); + } + if (param_.width < param_.height) { + return id + 1 + 3 * (FloorLog2(param_.width) - 3); + } + return id + 2 + 3 * (FloorLog2(param_.height) - 3); + } + + void Test(const char* digest, int num_runs); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kStride = kMaxSuperBlockSizeInPixels; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel); + const MaskBlendTestParam param_ = GetParam(); + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t mask_[kMaxSuperBlockSizeInPixels * 
kMaxSuperBlockSizeInPixels]; + uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + dsp::MaskBlendFunc func_; + dsp::InterIntraMaskBlendFunc8bpp func_8bpp_; +}; + +template <int bitdepth, typename Pixel> +void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, + const int num_runs) { + if (func_ == nullptr && func_8bpp_ == nullptr) return; + const int width = param_.width >> param_.subsampling_x; + const int height = param_.height >> param_.subsampling_y; + + // Add id offset to seed just to add more randomness to input blocks. + // If we use the same seed for different block sizes, the generated input + // blocks are repeated. For example, if input size is 8x8, the generated + // block is exactly the upper left half of the generated 16x16 block. + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + GetDigestIdOffset()); + PredType* src_1 = source1_; + uint8_t* src_1_8bpp = source1_8bpp_; + PredType* src_2 = source2_; + uint8_t* src_2_8bpp = source2_8bpp_; + const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width; + uint8_t* mask_row = mask_; + const int range_mask = (1 << (bitdepth)) - 1; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + if (param_.is_inter_intra && bitdepth == 8) { + src_1_8bpp[x] = src_1[x]; + src_2_8bpp[x] = src_2[x]; + } + if (!param_.is_inter_intra) { + // Implies isCompound == true. 
+ constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + } + src_1 += width; + src_1_8bpp += width; + src_2 += src_2_stride; + src_2_8bpp += src_2_stride; + } + // Mask should be setup regardless of subsampling. + for (int y = 0; y < param_.height; ++y) { + for (int x = 0; x < param_.width; ++x) { + mask_row[x] = rnd.Rand8() & 63; + mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64]. + } + mask_row += kStride; + } + + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + const absl::Time start = absl::Now(); + if (param_.is_inter_intra && bitdepth == 8) { + ASSERT_EQ(func_, nullptr); + static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), ""); + // source2_8bpp_ is modified in the call. + memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_)); + func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride, + width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x]; + } + } + memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_)); + } else { + if (bitdepth != 8) { + ASSERT_EQ(func_8bpp_, nullptr); + } + func_(source1_, source2_, src_2_stride, mask_, kStride, width, height, + dest_, kDestStride); + } + elapsed_time += absl::Now() - start; + } + + test_utils::CheckMd5Digest( + "MaskBlend", + absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), digest, + dest_, sizeof(dest_), elapsed_time); +} + +const MaskBlendTestParam kMaskBlendTestParam[] = { + // is_inter_intra = false, is_wedge_inter_intra = false. + // block size range is from 8x8 to 128x128. 
+ MaskBlendTestParam(8, 8, 0, 0, false, false), + MaskBlendTestParam(8, 16, 0, 0, false, false), + MaskBlendTestParam(16, 8, 0, 0, false, false), + MaskBlendTestParam(16, 16, 0, 0, false, false), + MaskBlendTestParam(16, 32, 0, 0, false, false), + MaskBlendTestParam(32, 16, 0, 0, false, false), + MaskBlendTestParam(32, 32, 0, 0, false, false), + MaskBlendTestParam(32, 64, 0, 0, false, false), + MaskBlendTestParam(64, 32, 0, 0, false, false), + MaskBlendTestParam(64, 64, 0, 0, false, false), + MaskBlendTestParam(64, 128, 0, 0, false, false), + MaskBlendTestParam(128, 64, 0, 0, false, false), + MaskBlendTestParam(128, 128, 0, 0, false, false), + MaskBlendTestParam(8, 8, 1, 0, false, false), + MaskBlendTestParam(8, 16, 1, 0, false, false), + MaskBlendTestParam(16, 8, 1, 0, false, false), + MaskBlendTestParam(16, 16, 1, 0, false, false), + MaskBlendTestParam(16, 32, 1, 0, false, false), + MaskBlendTestParam(32, 16, 1, 0, false, false), + MaskBlendTestParam(32, 32, 1, 0, false, false), + MaskBlendTestParam(32, 64, 1, 0, false, false), + MaskBlendTestParam(64, 32, 1, 0, false, false), + MaskBlendTestParam(64, 64, 1, 0, false, false), + MaskBlendTestParam(64, 128, 1, 0, false, false), + MaskBlendTestParam(128, 64, 1, 0, false, false), + MaskBlendTestParam(128, 128, 1, 0, false, false), + MaskBlendTestParam(8, 8, 1, 1, false, false), + MaskBlendTestParam(8, 16, 1, 1, false, false), + MaskBlendTestParam(16, 8, 1, 1, false, false), + MaskBlendTestParam(16, 16, 1, 1, false, false), + MaskBlendTestParam(16, 32, 1, 1, false, false), + MaskBlendTestParam(32, 16, 1, 1, false, false), + MaskBlendTestParam(32, 32, 1, 1, false, false), + MaskBlendTestParam(32, 64, 1, 1, false, false), + MaskBlendTestParam(64, 32, 1, 1, false, false), + MaskBlendTestParam(64, 64, 1, 1, false, false), + MaskBlendTestParam(64, 128, 1, 1, false, false), + MaskBlendTestParam(128, 64, 1, 1, false, false), + MaskBlendTestParam(128, 128, 1, 1, false, false), + // is_inter_intra = true, is_wedge_inter_intra 
= false. + // block size range is from 8x8 to 32x32. + MaskBlendTestParam(8, 8, 0, 0, true, false), + MaskBlendTestParam(8, 16, 0, 0, true, false), + MaskBlendTestParam(16, 8, 0, 0, true, false), + MaskBlendTestParam(16, 16, 0, 0, true, false), + MaskBlendTestParam(16, 32, 0, 0, true, false), + MaskBlendTestParam(32, 16, 0, 0, true, false), + MaskBlendTestParam(32, 32, 0, 0, true, false), + MaskBlendTestParam(8, 8, 1, 0, true, false), + MaskBlendTestParam(8, 16, 1, 0, true, false), + MaskBlendTestParam(16, 8, 1, 0, true, false), + MaskBlendTestParam(16, 16, 1, 0, true, false), + MaskBlendTestParam(16, 32, 1, 0, true, false), + MaskBlendTestParam(32, 16, 1, 0, true, false), + MaskBlendTestParam(32, 32, 1, 0, true, false), + MaskBlendTestParam(8, 8, 1, 1, true, false), + MaskBlendTestParam(8, 16, 1, 1, true, false), + MaskBlendTestParam(16, 8, 1, 1, true, false), + MaskBlendTestParam(16, 16, 1, 1, true, false), + MaskBlendTestParam(16, 32, 1, 1, true, false), + MaskBlendTestParam(32, 16, 1, 1, true, false), + MaskBlendTestParam(32, 32, 1, 1, true, false), + // is_inter_intra = true, is_wedge_inter_intra = true. + // block size range is from 8x8 to 32x32. 
+ MaskBlendTestParam(8, 8, 0, 0, true, true), + MaskBlendTestParam(8, 16, 0, 0, true, true), + MaskBlendTestParam(16, 8, 0, 0, true, true), + MaskBlendTestParam(16, 16, 0, 0, true, true), + MaskBlendTestParam(16, 32, 0, 0, true, true), + MaskBlendTestParam(32, 16, 0, 0, true, true), + MaskBlendTestParam(32, 32, 0, 0, true, true), + MaskBlendTestParam(8, 8, 1, 0, true, true), + MaskBlendTestParam(8, 16, 1, 0, true, true), + MaskBlendTestParam(16, 8, 1, 0, true, true), + MaskBlendTestParam(16, 16, 1, 0, true, true), + MaskBlendTestParam(16, 32, 1, 0, true, true), + MaskBlendTestParam(32, 16, 1, 0, true, true), + MaskBlendTestParam(32, 32, 1, 0, true, true), + MaskBlendTestParam(8, 8, 1, 1, true, true), + MaskBlendTestParam(8, 16, 1, 1, true, true), + MaskBlendTestParam(16, 8, 1, 1, true, true), + MaskBlendTestParam(16, 16, 1, 1, true, true), + MaskBlendTestParam(16, 32, 1, 1, true, true), + MaskBlendTestParam(32, 16, 1, 1, true, true), + MaskBlendTestParam(32, 32, 1, 1, true, true), +}; + +using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>; + +TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest8bpp, DISABLED_Speed) { + Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>; + +TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest10bpp, DISABLED_Speed) { + Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 
+INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc new file mode 100644 index 0000000..3a47cc7 --- /dev/null +++ b/src/dsp/motion_field_projection_test.cc @@ -0,0 +1,213 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cmath> +#include <cstdint> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/reference_info.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMotionFieldWidth = 160; +constexpr int kMotionFieldHight = 120; + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc. 
+class MotionFieldProjectionTest : public testing::TestWithParam<int> { + public: + MotionFieldProjectionTest() = default; + MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete; + MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) = + delete; + ~MotionFieldProjectionTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionFieldProjectionInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionFieldProjectionInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionFieldProjectionInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + target_motion_field_projection_kernel_func_ = + dsp->motion_field_projection_kernel; + } + + void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_; + ReferenceInfo reference_info_; + TemporalMotionField motion_field_; +}; + +void MotionFieldProjectionTest::SetInputData( + const int motion_field_width, libvpx_test::ACMRandom* const rnd) { + ASSERT_TRUE(reference_info_.Reset(kMotionFieldHight, motion_field_width)); + ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHight, motion_field_width, + /*zero_initialize=*/false)); + ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHight, + motion_field_width, + /*zero_initialize=*/false)); + constexpr int order_hint_bits = 6; + unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits); + const unsigned int current_frame_order_hint = + rnd->Rand8() & ((1 << 
order_hint_bits) - 1); // [0, 63] + uint8_t reference_frame_order_hint = 0; + reference_info_.relative_distance_to[0] = 0; + reference_info_.skip_references[kReferenceFrameIntra] = true; + reference_info_.projection_divisions[kReferenceFrameIntra] = 0; + for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) { + reference_frame_order_hint = + rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63] + const int relative_distance_to = + GetRelativeDistance(current_frame_order_hint, + reference_frame_order_hint, order_hint_shift_bits); + reference_info_.relative_distance_to[i] = relative_distance_to; + reference_info_.skip_references[i] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info_.projection_divisions[i] = + reference_info_.skip_references[i] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + } + for (int y = 0; y < kMotionFieldHight; ++y) { + for (int x = 0; x < motion_field_width; ++x) { + reference_info_.motion_field_reference_frame[y][x] = + static_cast<ReferenceFrameType>(rnd->Rand16() & + kReferenceFrameAlternate); + reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512; + reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512; + } + } + MotionVector invalid_mv; + invalid_mv.mv[0] = kInvalidMvValue; + invalid_mv.mv[1] = kInvalidMvValue; + MotionVector* const motion_field_mv = &motion_field_.mv[0][0]; + int8_t* const motion_field_reference_offset = + &motion_field_.reference_offset[0][0]; + std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(), + invalid_mv); + std::fill( + motion_field_reference_offset, + motion_field_reference_offset + motion_field_.reference_offset.size(), + -128); +} + +void MotionFieldProjectionTest::TestRandomValues(bool speed) { + static const char* const kDigestMv[8] = { + "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7", + "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3", + 
"9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47", + "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"}; + static const char* const kDigestReferenceOffset[8] = { + "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540", + "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36", + "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e", + "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"}; + const int num_tests = speed ? 2000 : 1; + if (target_motion_field_projection_kernel_func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int width_idx = 0; width_idx < 8; ++width_idx) { + const int motion_field_width = kMotionFieldWidth + width_idx; + SetInputData(motion_field_width, &rnd); + const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1; + const int reference_to_current_with_sign = + rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance; + assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance); + // Step of y8 and x8 is at least 16 except the last hop. 
+ for (int step = 16; step <= 80; step += 16) { + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y8 = 0; y8 < kMotionFieldHight; y8 += step) { + const int y8_end = std::min(y8 + step, kMotionFieldHight); + for (int x8 = 0; x8 < motion_field_width; x8 += step) { + const int x8_end = std::min(x8 + step, motion_field_width); + target_motion_field_projection_kernel_func_( + reference_info_, reference_to_current_with_sign, dst_sign, y8, + y8_end, x8, x8_end, &motion_field_); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(mv) width %d step %d", motion_field_width, step) + .c_str(), + kDigestMv[width_idx], motion_field_.mv[0], + sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(), + elapsed_time); + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(ref offset) width %d step %d", motion_field_width, + step) + .c_str(), + kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0], + sizeof(motion_field_.reference_offset[0][0]) * + motion_field_.reference_offset.size(), + elapsed_time); + } + } +} + +TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc new file mode 100644 index 0000000..a7b2ec8 --- /dev/null +++ b/src/dsp/motion_vector_search_test.cc @@ -0,0 +1,197 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" + +#include <cstdint> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc. 
+class MotionVectorSearchTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + MotionVectorSearchTest() = default; + MotionVectorSearchTest(const MotionVectorSearchTest&) = delete; + MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete; + ~MotionVectorSearchTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionVectorSearchInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionVectorSearchInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionVectorSearchInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + mv_projection_compound_[0] = dsp->mv_projection_compound[0]; + mv_projection_compound_[1] = dsp->mv_projection_compound[1]; + mv_projection_compound_[2] = dsp->mv_projection_compound[2]; + mv_projection_single_[0] = dsp->mv_projection_single[0]; + mv_projection_single_[1] = dsp->mv_projection_single[1]; + mv_projection_single_[2] = dsp->mv_projection_single[2]; + } + + void SetInputData(libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MvProjectionCompoundFunc mv_projection_compound_[3]; + MvProjectionSingleFunc mv_projection_single_[3]; + int reference_offsets_[2]; + alignas(kMaxAlignment) + MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding]; + int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding]; + CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + CompoundMotionVector 
compound_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + MotionVector single_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; +}; + +void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) { + reference_offsets_[0] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + reference_offsets_[1] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) { + temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance); + for (auto& mv : temporal_mvs_[i].mv) { + mv = rnd->Rand16Signed() / 8; + } + } + for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) { + for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) { + for (int k = 0; k < 2; ++k) { + single_mv_[i][j].mv[k] = rnd->Rand16Signed(); + for (auto& mv : compound_mv_[i][j].mv[k].mv) { + mv = rnd->Rand16Signed(); + } + } + compound_mv_org_[i][j] = compound_mv_[i][j]; + single_mv_org_[i][j] = single_mv_[i][j]; + } + } +} + +void MotionVectorSearchTest::TestRandomValues(bool speed) { + static const char* const kDigestCompound[3] = { + "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9", + "e42de30cd84fa4e7b8581a330ed08a8b"}; + static const char* const kDigestSingle[3] = { + "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc", + "7e699d58df3954a38ff11c8e34151e66"}; + const int num_tests = speed ? 
1000000 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_compound_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = count + (count & 1); + for (int i = 0; i < num_tests; ++i) { + mv_projection_compound_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_, + count, compound_mv_[count]); + } + // One more element could be calculated in SIMD implementations. + // Restore the original values if any. + for (int i = count; i < total_count; ++i) { + compound_mv_[count][i] = compound_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionCompound", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_), + elapsed_time); + } + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_single_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = (count + 3) & ~3; + for (int i = 0; i < num_tests; ++i) { + mv_projection_single_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0], + count, single_mv_[count]); + } + // Up to three more elements could be calculated in SIMD implementations. + // Restore the original values if any. 
+ for (int i = count; i < total_count; ++i) { + single_mv_[count][i] = single_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionSingle", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestSingle[function_index], single_mv_, sizeof(single_mv_), + elapsed_time); + } +} + +TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc new file mode 100644 index 0000000..60b10c6 --- /dev/null +++ b/src/dsp/obmc_test.cc @@ -0,0 +1,349 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/obmc.h" + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <ostream> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kMaxBlendingBlockSize = 64; +constexpr int kNumSpeedTests = 1000000; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "76906f87892c30c7059a5c97e4838c42", "0b8670d937217c66425f2662b51eebbe", + "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f", + "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b", + "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c", + "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519", + "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b", + "e2152dea159249149ff4151111b73ed6", "6b44c0052789ce2fa4df882f35618e7d", + "1edd570bec7e63780d83588f6aacda25", "b04b81c9e52c58885907dc7f1ef2c11c", + "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14", + "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546", + "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2", + "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548", + "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4", + }; + return kDigest[id]; +} + +const char* GetDigestSpeed8bpp(int id) { + static const char* const kDigest[] = { + "c5b532f5960477bdd50684ab25fae0f4", "bf76ed404bc5674e0a4ff238efceb62b", + 
"5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9", + "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345", + "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c", + "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049", + "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b", + "e3a1960b0a7238db1184a3f9d8e9a4b2", "721c7e8ec3aa0608b64f10f7ff5427db", + "ba9938553703d520bc0ade427c397140", "8b6e15e8ecd234363f70f51c64b0aea1", + "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e", + "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70", + "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad", + "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5", + "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "6ab8f28e8fb3c4b10b23efee38d4154e", "d4374005d34e43e06c1b0c906289dadd", + "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075", + "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3", + "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d", + "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca", + "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6", + "d2e70b81882aeb3d9fccef89e7552a9d", "4a692ddf91905727bc524d91735cf93c", + "f5d882ee6d9ae6f7dfa467ca99301424", "58821b87e7d9d4388d6003ffcb3723d1", + "824ddb98eb4129b3d254c0bc7a64cd73", "5eaaafa8ef9b7ba5e2856a947e5b33df", + "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78", + "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc", + "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a", + "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1", + }; + 
return kDigest[id]; +} + +const char* GetDigestSpeed10bpp(int id) { + static const char* const kDigest[] = { + "df59e5fd6e0237a56381f3a516806eb8", "f478bdf43e0b91b8dc9b2661eb207e49", + "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5", + "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225", + "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246", + "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c", + "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863", + "a148bbafdc4466fbb700b31acccca8ac", "ff0566921ff2d5145f79fbf409508fb2", + "5da9d960988549f53b817003b93e4d01", "fa9028b2ed049ad71b5fd15f2daacbe5", + "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089", + "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d", + "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed", + "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac", + "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct ObmcTestParam { + ObmcTestParam(int width, int height, ObmcDirection blending_direction) + : width(width), height(height), blending_direction(blending_direction) {} + int width; + int height; + ObmcDirection blending_direction; +}; + +std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height + << ", blending_direction: " << ToString(param.blending_direction); +} + +template <int bitdepth, typename Pixel> +class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> { + public: + ObmcBlendTest() = default; + ~ObmcBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + ObmcInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + 
testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + ObmcInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + ObmcInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->obmc_blend[blending_direction_]; + } + + protected: + int GetDigestId() const { + // blending_direction_ == 0: + // (width, height): + // (2, 2), id = 0. (2, 4), id = 1. (4, 2), id = 2. + // (4, 4), id = 3. (4, 8), id = 4. (8, 4), id = 5. + // ... + // blending_direction_ == 1: id starts from 13. + const int id = (blending_direction_ == kObmcDirectionVertical) ? 0 : 13; + if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1); + if (width_ < height_) return id + 1 + 3 * (FloorLog2(width_) - 1); + return id + 2 + 3 * (FloorLog2(height_) - 1); + } + + // Note |digest| is only used when |use_fixed_values| is false. 
+ void Test(const char* digest, bool use_fixed_values, int value); + void TestSpeed(const char* digest, int num_runs); + + private: + const int width_ = GetParam().width; + const int height_ = GetParam().height; + const int blending_direction_ = GetParam().blending_direction; + Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {}; + Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {}; + dsp::ObmcBlendFunc func_; +}; + +template <int bitdepth, typename Pixel> +void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest, + const bool use_fixed_values, + const int value) { + if (func_ == nullptr) return; + if (use_fixed_values) { + std::fill(source1_, + source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value); + std::fill(source2_, + source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value); + } else { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel* src_1 = source1_; + Pixel* src_2 = source2_; + const int mask = (1 << bitdepth) - 1; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + src_1[x] = rnd.Rand16() & mask; + src_2[x] = rnd.Rand16() & mask; + } + src_1 += kMaxBlendingBlockSize; + src_2 += kMaxBlendingBlockSize; + } + } + const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); + func_(source1_, stride, width_, height_, source2_, stride); + if (use_fixed_values) { + const bool success = test_utils::CompareBlocks( + source1_, source2_, width_, height_, kMaxBlendingBlockSize, + kMaxBlendingBlockSize, false); + EXPECT_TRUE(success); + } else { + test_utils::CheckMd5Digest( + "Obmc", absl::StrFormat("%dx%d", width_, height_).c_str(), digest, + source1_, sizeof(source1_), absl::Duration()); + } +} + +template <int bitdepth, typename Pixel> +void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest, + const int num_runs) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + Pixel* 
src_1 = source1_; + Pixel* src_2 = source2_; + const int mask = (1 << bitdepth) - 1; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + src_1[x] = rnd.Rand16() & mask; + src_2[x] = rnd.Rand16() & mask; + } + src_1 += kMaxBlendingBlockSize; + src_2 += kMaxBlendingBlockSize; + } + const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); + uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize]; + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + memcpy(dest, source1_, + sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize); + const absl::Time start = absl::Now(); + func_(dest, stride, width_, height_, source2_, stride); + elapsed_time += absl::Now() - start; + } + memcpy(source1_, dest, + sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize); + test_utils::CheckMd5Digest("Obmc", + absl::StrFormat("%dx%d", width_, height_).c_str(), + digest, source1_, sizeof(source1_), elapsed_time); +} + +const ObmcTestParam kObmcTestParam[] = { + ObmcTestParam(2, 2, kObmcDirectionVertical), + ObmcTestParam(2, 4, kObmcDirectionVertical), + ObmcTestParam(4, 2, kObmcDirectionVertical), + ObmcTestParam(4, 4, kObmcDirectionVertical), + ObmcTestParam(4, 8, kObmcDirectionVertical), + ObmcTestParam(8, 4, kObmcDirectionVertical), + ObmcTestParam(8, 8, kObmcDirectionVertical), + ObmcTestParam(8, 16, kObmcDirectionVertical), + ObmcTestParam(16, 8, kObmcDirectionVertical), + ObmcTestParam(16, 16, kObmcDirectionVertical), + ObmcTestParam(16, 32, kObmcDirectionVertical), + ObmcTestParam(32, 16, kObmcDirectionVertical), + ObmcTestParam(32, 32, kObmcDirectionVertical), + ObmcTestParam(2, 2, kObmcDirectionHorizontal), + ObmcTestParam(2, 4, kObmcDirectionHorizontal), + ObmcTestParam(4, 2, kObmcDirectionHorizontal), + ObmcTestParam(4, 4, kObmcDirectionHorizontal), + ObmcTestParam(4, 8, kObmcDirectionHorizontal), + ObmcTestParam(8, 4, kObmcDirectionHorizontal), + ObmcTestParam(8, 8, kObmcDirectionHorizontal), + 
ObmcTestParam(8, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 8, kObmcDirectionHorizontal), + ObmcTestParam(16, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 32, kObmcDirectionHorizontal), + ObmcTestParam(32, 16, kObmcDirectionHorizontal), + ObmcTestParam(32, 32, kObmcDirectionHorizontal), +}; + +using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>; + +TEST_P(ObmcBlendTest8bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255); + Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) { + TestSpeed( + GetDigestSpeed8bpp(GetDigestId()), + (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>; + +TEST_P(ObmcBlendTest10bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1); + Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) { + TestSpeed( + GetDigestSpeed10bpp(GetDigestId()), + (kNumSpeedTests * 32 * 32) / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, 
ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc index d041bd1..abb01a1 100644 --- a/src/dsp/super_res.cc +++ b/src/dsp/super_res.cc @@ -26,10 +26,10 @@ namespace { template <int bitdepth, typename Pixel> void SuperRes_C(const void* /*coefficients*/, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, - const int initial_subpixel_x, const int step, - void* const dest) { + const int initial_subpixel_x, const int step, void* const dest, + ptrdiff_t dest_stride) { assert(step <= 1 << kSuperResScaleBits); auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<Pixel*>(dest); @@ -61,8 +61,8 @@ void SuperRes_C(const void* /*coefficients*/, void* const source, (1 << bitdepth) - 1); subpixel_x += step; } while (++x < upscaled_width); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc new file mode 100644 index 0000000..a93fc31 --- /dev/null +++ b/src/dsp/super_res_test.cc @@ -0,0 +1,264 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/super_res.h" + +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <string> +#include <vector> + +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e5; + +const char* GetDigest8bpp(int id) { + static const char* const kDigestSuperRes[] = { + "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9", + "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"}; + return kDigestSuperRes[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigestSuperRes[] = { + "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3", + "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"}; + return kDigestSuperRes[id]; +} +#endif + +struct SuperResTestParam { + SuperResTestParam(int downscaled_width, int upscaled_width) + : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {} + int downscaled_width; + int upscaled_width; +}; + +template <int bitdepth, typename Pixel, typename Coefficient> +class SuperResTest : public testing::TestWithParam<SuperResTestParam>, + public test_utils::MaxAlignedAllocable { + public: + SuperResTest() = default; + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + SuperResInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + const testing::TestInfo* const test_info = + 
testing::UnitTest::GetInstance()->current_test_info(); + const std::vector<std::string> split_test_name = + absl::StrSplit(test_info->name(), '/'); + ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_)); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + SuperResInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + SuperResInit_SSE4_1(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + super_res_coefficients_ = dsp->super_res_coefficients; + func_ = dsp->super_res; + } + + void TestComputeSuperRes(int fixed_value, int num_runs); + + private: + static constexpr int kHeight = 127; + // The maximum width that must be allocated. + static constexpr int kUpscaledBufferWidth = 192; + // Allow room for the filter taps. + static constexpr int kStride = + ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15); + const int kDownscaledWidth = GetParam().downscaled_width; + const int kUpscaledWidth = GetParam().upscaled_width; + int test_id_; + SuperResCoefficientsFunc super_res_coefficients_; + SuperResFunc func_; + Pixel source_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Coefficient + superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth]; +}; + +template <int bitdepth, typename Pixel, typename Coefficient> +void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes( + int fixed_value, int num_runs) { + if (func_ == nullptr) return; + const int superres_width = kDownscaledWidth << kSuperResScaleBits; + const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth; + const int error = step * kUpscaledWidth - superres_width; + const int initial_subpixel_x = + ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) + + DivideBy2(kUpscaledWidth)) / + kUpscaledWidth + 
+ (1 << (kSuperResExtraBits - 1)) - error / 2) & + kSuperResScaleMask; + if (super_res_coefficients_ != nullptr) { + super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step, + superres_coefficients_); + } + memset(dest_buffer_, 0, sizeof(dest_buffer_)); + if (fixed_value != 0) { + SetBlock<Pixel>(kHeight, kStride, fixed_value, source_buffer_[0], kStride); + } else { + // Random values. + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kStride; ++x) { + source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask; + } + } + } + // Offset starting point in the buffer to accommodate line extension. + Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder; + + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth, + kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride); + } + const absl::Duration elapsed_time = absl::Now() - start; + + if (fixed_value != 0) { + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kUpscaledWidth; ++x) { + EXPECT_TRUE(dest_buffer_[y][x] == fixed_value) + << "At location [" << y << ", " << x + << "]\nexpected: " << fixed_value + << "\nactual: " << dest_buffer_[y][x]; + } + } + } else if (num_runs == 1) { + // Random values. + if ((kUpscaledWidth & 15) != 0) { + // The SIMD functions overwrite up to 15 pixels in each row. Reset them. 
+ for (int y = 0; y < kHeight; ++y) { + for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) { + dest_buffer_[y][x] = 0; + } + } + } + const char* expected_digest; + if (bitdepth == 8) { + expected_digest = GetDigest8bpp(test_id_); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + expected_digest = GetDigest10bpp(test_id_); +#endif + } + test_utils::CheckMd5Digest( + "SuperRes", + absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step, + initial_subpixel_x) + .c_str(), + expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time); + } else { + // Speed test. + printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n", + kUpscaledWidth, step, initial_subpixel_x, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time))); + } +} + +using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>; + +TEST_P(SuperResTest8bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(255, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); } + +TEST_P(SuperResTest8bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +const SuperResTestParam kSuperResTestParams[] = { + SuperResTestParam(96, 192), + SuperResTestParam(171, 192), + SuperResTestParam(102, 128), + SuperResTestParam(61, 121), +}; + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>; + +TEST_P(SuperResTest10bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(511, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); } + 
+TEST_P(SuperResTest10bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc new file mode 100644 index 0000000..e7384f4 --- /dev/null +++ b/src/dsp/warp_test.cc @@ -0,0 +1,649 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/warp.h" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/base/macros.h" +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/post_filter.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kSourceBorderHorizontal = 16; +constexpr int kSourceBorderVertical = 13; + +constexpr int kMaxSourceBlockWidth = + kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2; +constexpr int kMaxSourceBlockHeight = + kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2; +constexpr int kMaxDestBlockWidth = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; +constexpr int kMaxDestBlockHeight = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; + +constexpr uint16_t kDivisorLookup[257] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 
12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192}; + +template <bool is_compound> +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0", + "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9", + "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41", + "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac", + "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d", + "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72", + "7f17d290d513a7416761b3a01f10fd2f", + }; + static const char* const kCompoundDigest[] = { + "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c", + "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc", + "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50", + 
"8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2", + "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af", + "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30", + "b2a0ce68db3cadd207299f73112bed74", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template <bool is_compound> +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b", + "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521", + "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97", + "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c", + "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33", + "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd", + "5ed615091e2f62df26de7e91a985cb81", + }; + static const char* const kCompoundDigest[] = { + "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50", + "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38", + "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd", + "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27", + "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9", + "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047", + "42eb66e752e9ef289b47053b5c73fdd6", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} +#endif + +int RandomWarpedParam(int seed_offset, int bits) { + libvpx_test::ACMRandom rnd(seed_offset + + libvpx_test::ACMRandom::DeterministicSeed()); + // 1 in 8 chance of generating zero (arbitrary). + const bool zero = (rnd.Rand16() & 7) == 0; + if (zero) return 0; + // Generate uniform values in the range [-(1 << bits), 1] U [1, 1 << + // bits]. 
+ const int mask = (1 << bits) - 1; + const int value = 1 + (rnd.RandRange(1u << 31) & mask); + const bool sign = (rnd.Rand16() & 1) != 0; + return sign ? value : -value; +} + +// This function is a copy from warp_prediction.cc. +template <typename T> +void GenerateApproximateDivisor(T value, int16_t* division_factor, + int16_t* division_shift) { + const int n = FloorLog2(std::abs(value)); + const T e = std::abs(value) - (static_cast<T>(1) << n); + const int entry = (n > kDivisorLookupBits) + ? RightShiftWithRounding(e, n - kDivisorLookupBits) + : static_cast<int>(e << (kDivisorLookupBits - n)); + *division_shift = n + kDivisorLookupPrecisionBits; + *division_factor = + (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry]; +} + +// This function is a copy from warp_prediction.cc. +int16_t GetShearParameter(int value) { + return static_cast<int16_t>( + LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits), + kWarpParamRoundingBits)); +} + +// This function is a copy from warp_prediction.cc. +// This function is used here to help generate valid warp parameters. 
+bool SetupShear(const int* params, int16_t* alpha, int16_t* beta, + int16_t* gamma, int16_t* delta) { + int16_t division_shift; + int16_t division_factor; + GenerateApproximateDivisor<int32_t>(params[2], &division_factor, + &division_shift); + const int alpha0 = + Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX); + const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX); + const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits); + const int gamma0 = + Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift), + INT16_MIN, INT16_MAX); + const int64_t w = static_cast<int64_t>(params[3]) * params[4]; + const int delta0 = Clip3( + params[5] - + RightShiftWithRoundingSigned(w * division_factor, division_shift) - + (1 << kWarpedModelPrecisionBits), + INT16_MIN, INT16_MAX); + + *alpha = GetShearParameter(alpha0); + *beta = GetShearParameter(beta0); + *gamma = GetShearParameter(gamma0); + *delta = GetShearParameter(delta0); + if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >= + (1 << kWarpedModelPrecisionBits)) || + (4 * std::abs(*gamma) + 4 * std::abs(*delta) >= + (1 << kWarpedModelPrecisionBits))) { + return false; // NOLINT (easier condition to understand). 
+ } + + return true; +} + +void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta, + int16_t* gamma, int16_t* delta, int seed) { + do { + params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6); + params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6); + params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) + + (1 << kWarpedModelPrecisionBits); + params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3); + params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3); + params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) + + (1 << kWarpedModelPrecisionBits); + ++seed; + } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta)); +} + +struct WarpTestParam { + WarpTestParam(int width, int height) : width(width), height(height) {} + int width; + int height; +}; + +template <bool is_compound, int bitdepth, typename Pixel> +class WarpTest : public testing::TestWithParam<WarpTestParam> { + public: + WarpTest() = default; + ~WarpTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + WarpInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + WarpInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + WarpInit_SSE4_1(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = is_compound ? 
dsp->warp_compound : dsp->warp; + } + + protected: + using DestType = + typename std::conditional<is_compound, uint16_t, Pixel>::type; + + void SetInputData(bool use_fixed_values, int value); + void Test(bool use_fixed_values, int value, int num_runs = 1); + void TestFixedValues(); + void TestRandomValues(); + void TestSpeed(); + + const WarpTestParam param_ = GetParam(); + + private: + int warp_params_[8]; + dsp::WarpFunc func_; + // Warp filters are 7-tap, which needs 3 pixels (kConvolveBorderLeftTop) + // padding. Destination buffer indices are based on subsampling values (x+y): + // 0: (4:4:4), 1:(4:2:2), 2: (4:2:0). + Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {}; + DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {}; +}; + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::SetInputData(bool use_fixed_values, + int value) { + if (use_fixed_values) { + for (int y = 0; y < param_.height; ++y) { + const int row = kSourceBorderVertical + y; + Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal, + value, param_.width); + } + } else { + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int y = 0; y < param_.height; ++y) { + const int row = kSourceBorderVertical + y; + for (int x = 0; x < param_.width; ++x) { + const int column = kSourceBorderHorizontal + x; + source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask; + } + } + } + PostFilter::ExtendFrame<Pixel>( + &source_[kSourceBorderVertical * kMaxSourceBlockWidth + + kSourceBorderHorizontal], + param_.width, param_.height, kMaxSourceBlockWidth, + kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical, + kSourceBorderVertical); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values, + int value, + int num_runs /*= 1*/) { + if (func_ == 
nullptr) return; + SetInputData(use_fixed_values, value); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int source_offset = + kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal; + const int dest_offset = + kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop; + const Pixel* const src = source_ + source_offset; + const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel); + const ptrdiff_t dst_stride = + is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel); + + absl::Duration elapsed_time; + for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) { + for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) { + if (subsampling_x == 0 && subsampling_y == 1) { + // When both are 0: 4:4:4 + // When both are 1: 4:2:0 + // When only |subsampling_x| is 1: 4:2:2 + // Having only |subsampling_y| == 1 is unsupported. + continue; + } + int params[8]; + int16_t alpha; + int16_t beta; + int16_t gamma; + int16_t delta; + GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8()); + + const int dest_id = subsampling_x + subsampling_y; + DestType* const dst = dest_[dest_id] + dest_offset; + const absl::Time start = absl::Now(); + for (int n = 0; n < num_runs; ++n) { + func_(src, src_stride, param_.width, param_.height, params, + subsampling_x, subsampling_y, 0, 0, param_.width, param_.height, + alpha, beta, gamma, delta, dst, dst_stride); + } + elapsed_time += absl::Now() - start; + } + } + + if (use_fixed_values) { + // For fixed values, input and output are identical. + for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) { + // |is_compound| holds a few more bits of precision and an offset value. + Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight]; + const int compound_offset = (bitdepth == 8) ? 
0 : kCompoundOffset; + if (is_compound) { + for (int y = 0; y < param_.height; ++y) { + for (int x = 0; x < param_.width; ++x) { + const int compound_value = + dest_[i][dest_offset + y * kMaxDestBlockWidth + x]; + const int remove_offset = compound_value - compound_offset; + const int full_shift = + remove_offset >> + (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical); + compensated_dest[y * kMaxDestBlockWidth + x] = + Clip3(full_shift, 0, (1 << bitdepth) - 1); + } + } + } + Pixel* pixel_dest = + is_compound ? compensated_dest + : reinterpret_cast<Pixel*>(dest_[i] + dest_offset); + const bool success = test_utils::CompareBlocks( + src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth, + kMaxDestBlockWidth, false); + EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i; + } + } else { + // (width, height): + // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2. + // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5. + // ... + // (128, 128), id = 12. + int id; + if (param_.width == param_.height) { + id = 3 * static_cast<int>(FloorLog2(param_.width) - 3); + } else if (param_.width < param_.height) { + id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3); + } else { + id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3); + } + + const char* expected_digest; + if (bitdepth == 8) { + expected_digest = GetDigest8bpp<is_compound>(id); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + expected_digest = GetDigest10bpp<is_compound>(id); +#endif + } + test_utils::CheckMd5Digest( + "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), + expected_digest, dest_, sizeof(dest_), elapsed_time); + } +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::TestFixedValues() { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << bitdepth) - 1); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, 
Pixel>::TestRandomValues() { + Test(false, 0); +} + +template <bool is_compound, int bitdepth, typename Pixel> +void WarpTest<is_compound, bitdepth, Pixel>::TestSpeed() { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +void ApplyFilterToSignedInput(const int min_input, const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + int min = 0, max = 0; + for (int i = 0; i < kSubPixelTaps; ++i) { + const int tap = filter[i]; + if (tap > 0) { + max += max_input * tap; + min += min_input * tap; + } else { + min += max_input * tap; + max += min_input * tap; + } + } + *min_output = min; + *max_output = max; +} + +void ApplyFilterToUnsignedInput(const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output); +} + +// Validate the maximum ranges for different parts of the Warp process. +template <int bitdepth> +void ShowRange() { + constexpr int horizontal_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int vertical_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsVertical12bpp + : kInterRoundBitsVertical; + constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical; + + constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset; + + constexpr int max_input = (1 << bitdepth) - 1; + + const int8_t* worst_warp_filter = kWarpedFilters8[93]; + + // First pass. + printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, + max_input); + + int min = 0, max = 0; + ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max); + + int first_pass_offset; + if (bitdepth == 8) { + // Derive an offset for 8 bit. 
+ for (first_pass_offset = 1; - first_pass_offset > min; + first_pass_offset <<= 1) { + } + printf(" 8bpp intermediate offset: %d.\n", first_pass_offset); + min += first_pass_offset; + max += first_pass_offset; + assert(min > 0); + assert(max < UINT16_MAX); + } else { + // 10bpp and 12bpp require int32_t for the intermediate values. Adding an + // offset is not required. + assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + const int first_pass_min = RightShiftWithRounding(min, horizontal_bits); + const int first_pass_max = RightShiftWithRounding(max, horizontal_bits); + + printf(" first pass output range: [%8d, %8d]\n", first_pass_min, + first_pass_max); + + // Second pass. + if (bitdepth == 8) { + ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max); + } else { + ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter, + &min, &max); + } + + if (bitdepth == 8) { + // Remove the offset that was applied in the first pass since we must use + // int32_t for this phase anyway. 128 is the sum of the filter taps. + const int offset_removal = (first_pass_offset >> horizontal_bits) * 128; + printf(" 8bpp intermediate offset removal: %d.\n", offset_removal); + max -= offset_removal; + min -= offset_removal; + assert(min < INT16_MIN && min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } else { + // 10bpp and 12bpp require int32_t for the intermediate values. Adding an + // offset is not required. + assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" intermediate range: [%8d, %8d]\n", min, max); + + // Second pass non-compound output is clipped to Pixel values. 
+ const int second_pass_min = + Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input); + const int second_pass_max = + Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input); + printf(" second pass output range: [%8d, %8d]\n", second_pass_min, + second_pass_max); + + // Output is Pixel so matches Pixel values. + assert(second_pass_min == 0); + assert(second_pass_max == max_input); + + const int compound_second_pass_min = + RightShiftWithRounding(min, compound_vertical_bits) + compound_offset; + const int compound_second_pass_max = + RightShiftWithRounding(max, compound_vertical_bits) + compound_offset; + + printf(" compound second pass output range: [%8d, %8d]\n", + compound_second_pass_min, compound_second_pass_max); + + if (bitdepth == 8) { + // 8bpp output is int16_t without an offset. + assert(compound_second_pass_min > INT16_MIN); + assert(compound_second_pass_max < INT16_MAX); + } else { + // 10bpp and 12bpp use the offset to fit inside uint16_t. + assert(compound_second_pass_min > 0); + assert(compound_second_pass_max < UINT16_MAX); + } + + printf("\n"); +} + +TEST(WarpTest, ShowRange) { + ShowRange<kBitdepth8>(); + ShowRange<kBitdepth10>(); + ShowRange<kBitdepth12>(); +} + +using WarpTest8bpp = WarpTest</*is_compound=*/false, 8, uint8_t>; +// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via +// WarpCompoundTest. +// using WarpCompoundTest8bpp = WarpTest</*is_compound=*/true, 8, uint8_t>; + +// Verifies the sum of the warped filter coefficients is 128 for every filter. +// +// Verifies the properties used in the calculation of ranges of variables in +// the block warp process: +// * The maximum sum of the positive warped filter coefficients is 175. +// * The minimum (i.e., most negative) sum of the negative warped filter +// coefficients is -47. 
+// +// NOTE: This test is independent of the bitdepth and the implementation of the +// block warp function, so it just needs to be a test in the WarpTest8bpp class +// and does not need to be defined with TEST_P. +TEST(WarpTest8bpp, WarpedFilterCoefficientSums) { + int max_positive_sum = 0; + int min_negative_sum = 0; + for (const auto& filter : kWarpedFilters) { + int sum = 0; + int positive_sum = 0; + int negative_sum = 0; + for (const auto coefficient : filter) { + sum += coefficient; + if (coefficient > 0) { + positive_sum += coefficient; + } else { + negative_sum += coefficient; + } + } + EXPECT_EQ(sum, 128); + max_positive_sum = std::max(positive_sum, max_positive_sum); + min_negative_sum = std::min(negative_sum, min_negative_sum); + } + EXPECT_EQ(max_positive_sum, 175); + EXPECT_EQ(min_negative_sum, -47); +} + +TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); } + +TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); } + +TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); } +const WarpTestParam warp_test_param[] = { + WarpTestParam(8, 8), WarpTestParam(8, 16), WarpTestParam(16, 8), + WarpTestParam(16, 16), WarpTestParam(16, 32), WarpTestParam(32, 16), + WarpTestParam(32, 32), WarpTestParam(32, 64), WarpTestParam(64, 32), + WarpTestParam(64, 64), WarpTestParam(64, 128), WarpTestParam(128, 64), + WarpTestParam(128, 128), +}; + +INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp, + testing::ValuesIn(warp_test_param)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp, + testing::ValuesIn(warp_test_param)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WarpTest10bpp = WarpTest</*is_compound=*/false, 10, uint16_t>; +// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via +// WarpCompoundTest. 
+// using WarpCompoundTest10bpp = WarpTest</*is_compound=*/true, 10, uint16_t>; + +TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); } + +TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); } + +TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param)); +#endif + +std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) { + return os << "BlockSize" << warp_param.width << "x" << warp_param.height; +} + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc new file mode 100644 index 0000000..77b608e --- /dev/null +++ b/src/dsp/weight_mask_test.cc @@ -0,0 +1,390 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/weight_mask.h" + +#include <algorithm> +#include <cstdint> +#include <ostream> +#include <string> +#include <type_traits> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +constexpr int kMaxPredictionSize = 128; +// weight_mask is only used with kCompoundPredictionTypeDiffWeighted with +// convolve producing the most extreme ranges. +// This includes kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "035267cb2ac5a0f8ff50c2d30ad52226", + "3231f4972dd858b734e0cc48c4cd001e", + "7e163b69721a13ec9f75b5cd74ffee3f", + "" /*kBlock4x16*/, + "b75e90abc224acca8754c82039b3ba93", + "9f555f3a2c1a933a663d6103b8118dea", + "8539e54f34cd6668ff6e6606210be201", + "20f85c9db7c878c21fbf2052936f269e", + "620ec166de57b0639260b2d72eebfc3e", + "be666394b5a894d78f4097b6cca272fe", + "57a96816e84cdb381f596c23827b5922", + "f2e0d348f608f246b6d8d799b66c189e", + "161ac051f38372d9339d36728b9926ba", + "d5fad48aaf132a81cb62bba4f07bbebb", + "e10be2dca2f7dae38dae75150fc1612d", + "7f744481eb551bbc224b5236c82cbade", + "0d99bbf31ecddc1c2d5063a68c0e9375", + "5fb8ec5f582f0ebfe519ed55860f67c4", + + // mask_is_inverse = true. 
+ "a4250ca39daa700836138371d36d465f", + "abe9a9a1c3a5accda9bfefd4d6e81ccb", + "e95b08878d0bb5f2293c27c3a6fe0253", + "" /*kBlock4x16*/, + "e1c52be02ce9ab2800015bb08b866c31", + "eea1dc73811f73866edfeb4555865f20", + "3178e64085645bd819256a8ab43c7b0a", + "ee83884e4d5cd2c9ac04879116bab681", + "d107eff7d5ae9ba14d2c6b3b8d9fca49", + "400aeea7d299626fc336c46b1ad7a9d8", + "e9e26a400f67f3ad36350fe4171fc613", + "4c31ad714f470f34127febaf1bac714b", + "bbdcb1097c66d561dd4ea16b3fb73f97", + "3a21dfbf53e4c964e303a75a3308ce15", + "3416dab4512fd0dc61d788b433cd624e", + "68ace8f01fdd74aec3fee528c8167738", + "9fabe05a6523da81a45150e19f75acff", + "7c0643e4d02421d06d7ca71822a94e1d", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1dc9bdd042e5228705b857b42798e364", + "c054c8644bd482ce78a139d8e063e013", + "bbe4ac48f013f34c84779da05b0bcbe0", + "" /*kBlock4x16*/, + "13d4759277637a607f25439182553708", + "f089667610561a47d50f9f930ad7c454", + "46715e6f7819f59725bdb083f4403255", + "3774541c339ae3af920ef2b1d6abf6a1", + "94913b01d226cb5eb273dfee84b51f65", + "be0c0847629dfff8e0e991ed67697a7d", + "716b5398b77d7459274d4ea9c91ebd8e", + "f5c1b0b461df4182529949472242b421", + "5e9576ea4cf107249ce4ae89a72b9c95", + "da021bcdf7936f7bd9a2399c69e4d37c", + "b3a310a39c1900e00f992839ff188656", + "9f3a15351af5945615f296242ec56a38", + "b6e0bd03c521c5f00e90530daa7d4432", + "3270d7f621d488aec5b76bcf121debd0", + + // mask_is_inverse = true. 
+ "33df96dd246683133eefe4caea6e3f7d", + "73e0ccc5d42806548a4b59f856256c1e", + "3561a0358cf831aee9477d07feafae2d", + "" /*kBlock4x16*/, + "c5a2e633c0cd6925e68f21f47f0e2d84", + "8755a2d3840dde5fd6a0cce6bd6642c5", + "85ec538b72cecd6ea1fddab5ce3b4e64", + "a53e0dec84c675c4c6b1f5792b0232ff", + "86180da325f9727670a98cf2dbf7410e", + "a5fdc95104948047e179b2bc3d47f51d", + "9b95b3858187838e4669180e2ddb295e", + "6e40ca55608f6bf2f8cd91c8dbf3ddbf", + "d3a092672e921b588279d57e50b31888", + "9883eb19b733ee9f1cb6a6b6a1a00bb5", + "dd34764e068b228b7820321b06864e63", + "6c743dc9c8c87c7044151d29993e5042", + "44925dab01011a98b8ab1f0308fa852a", + "6d984b2ccfa056278e2130771127a943", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct WeightMaskTestParam { + WeightMaskTestParam(int width, int height, bool mask_is_inverse) + : width(width), height(height), mask_is_inverse(mask_is_inverse) {} + int width; + int height; + bool mask_is_inverse; +}; + +std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) { + return os << param.width << "x" << param.height + << ", mask_is_inverse: " << param.mask_is_inverse; +} + +template <int bitdepth> +class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>, + public test_utils::MaxAlignedAllocable { + public: + WeightMaskTest() = default; + ~WeightMaskTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + WeightMaskInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const int width_index = FloorLog2(width_) - 3; + const int height_index = FloorLog2(height_) - 3; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + WeightMaskInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + 
WeightMaskInit_SSE4_1(); + } + func_ = dsp->weight_mask[width_index][height_index][mask_is_inverse_]; + } + + protected: + void SetInputData(bool use_fixed_values, int value_1, int value_2); + void Test(int num_runs, bool use_fixed_values, int value_1, int value_2); + + private: + const int width_ = GetParam().width; + const int height_ = GetParam().height; + const bool mask_is_inverse_ = GetParam().mask_is_inverse; + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + alignas( + kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize]; + alignas( + kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize]; + uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {}; + dsp::WeightMaskFunc func_; +}; + +template <int bitdepth> +void WeightMaskTest<bitdepth>::SetInputData(const bool use_fixed_values, + const int value_1, + const int value_2) { + if (use_fixed_values) { + std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize, + value_1); + std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize, + value_2); + } else { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + block_1_[y * width_ + x] = + static_cast<PredType>(rnd(max_val - min_val) + min_val); + block_2_[y * width_ + x] = + static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + } + } +} + +BlockSize DimensionsToBlockSize(int width, int height) { + if (width == 4) { + if (height == 4) return kBlock4x4; + if (height == 8) return kBlock4x8; + if (height == 16) return kBlock4x16; + return kBlockInvalid; + } + if (width == 8) { + if (height == 4) return kBlock8x4; + if (height == 8) return kBlock8x8; + if (height == 16) 
return kBlock8x16; + if (height == 32) return kBlock8x32; + return kBlockInvalid; + } + if (width == 16) { + if (height == 4) return kBlock16x4; + if (height == 8) return kBlock16x8; + if (height == 16) return kBlock16x16; + if (height == 32) return kBlock16x32; + if (height == 64) return kBlock16x64; + return kBlockInvalid; + } + if (width == 32) { + if (height == 8) return kBlock32x8; + if (height == 16) return kBlock32x16; + if (height == 32) return kBlock32x32; + if (height == 64) return kBlock32x64; + return kBlockInvalid; + } + if (width == 64) { + if (height == 16) return kBlock64x16; + if (height == 32) return kBlock64x32; + if (height == 64) return kBlock64x64; + if (height == 128) return kBlock64x128; + return kBlockInvalid; + } + if (width == 128) { + if (height == 64) return kBlock128x64; + if (height == 128) return kBlock128x128; + return kBlockInvalid; + } + return kBlockInvalid; +} + +template <int bitdepth> +void WeightMaskTest<bitdepth>::Test(const int num_runs, + const bool use_fixed_values, + const int value_1, const int value_2) { + if (func_ == nullptr) return; + SetInputData(use_fixed_values, value_1, value_2); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(block_1_, block_2_, mask_, kMaxPredictionSize); + } + const absl::Duration elapsed_time = absl::Now() - start; + if (use_fixed_values) { + int fixed_value = (value_1 - value_2 == 0) ? 38 : 64; + if (mask_is_inverse_) fixed_value = 64 - fixed_value; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + ASSERT_EQ(static_cast<int>(mask_[y * kMaxPredictionSize + x]), + fixed_value) + << "x: " << x << " y: " << y; + } + } + } else { + const int id_offset = mask_is_inverse_ ? 
kMaxBlockSizes - 4 : 0; + const int id = id_offset + + static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4; + if (bitdepth == 8) { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time); +#endif + } + } +} + +const WeightMaskTestParam weight_mask_test_param[] = { + WeightMaskTestParam(8, 8, false), WeightMaskTestParam(8, 16, false), + WeightMaskTestParam(8, 32, false), WeightMaskTestParam(16, 8, false), + WeightMaskTestParam(16, 16, false), WeightMaskTestParam(16, 32, false), + WeightMaskTestParam(16, 64, false), WeightMaskTestParam(32, 8, false), + WeightMaskTestParam(32, 16, false), WeightMaskTestParam(32, 32, false), + WeightMaskTestParam(32, 64, false), WeightMaskTestParam(64, 16, false), + WeightMaskTestParam(64, 32, false), WeightMaskTestParam(64, 64, false), + WeightMaskTestParam(64, 128, false), WeightMaskTestParam(128, 64, false), + WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true), + WeightMaskTestParam(8, 16, true), WeightMaskTestParam(8, 32, true), + WeightMaskTestParam(16, 8, true), WeightMaskTestParam(16, 16, true), + WeightMaskTestParam(16, 32, true), WeightMaskTestParam(16, 64, true), + WeightMaskTestParam(32, 8, true), WeightMaskTestParam(32, 16, true), + WeightMaskTestParam(32, 32, true), WeightMaskTestParam(32, 64, true), + WeightMaskTestParam(64, 16, true), WeightMaskTestParam(64, 32, true), + WeightMaskTestParam(64, 64, true), WeightMaskTestParam(64, 128, true), + WeightMaskTestParam(128, 64, true), WeightMaskTestParam(128, 128, true), +}; + +using WeightMaskTest8bpp = WeightMaskTest<8>; + +TEST_P(WeightMaskTest8bpp, FixedValues) { + const int min = kCompoundPredictionRange[0][0]; + const int max = 
kCompoundPredictionRange[0][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest8bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WeightMaskTest10bpp = WeightMaskTest<10>; + +TEST_P(WeightMaskTest10bpp, FixedValues) { + const int min = kCompoundPredictionRange[1][0]; + const int max = kCompoundPredictionRange[1][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest10bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc index 8e008d1..ec9f589 100644 --- a/src/dsp/x86/average_blend_sse4.cc +++ b/src/dsp/x86/average_blend_sse4.cc @@ -30,6 +30,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; @@ -138,13 +139,232 @@ void Init8bpp() { 
} } // namespace +} // namespace low_bitdepth -void AverageBlendInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kInterPostRoundBitPlusOne = 5; + +template <const int width, const int offset> +inline void AverageBlendRow(const uint16_t* prediction_0, + const uint16_t* prediction_1, + const __m128i& compound_offset, + const __m128i& round_offset, const __m128i& max, + const __m128i& zero, uint16_t* dst, + const ptrdiff_t dest_stride) { + // pred_0/1 max range is 16b. + const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset); + const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset); + const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0); + const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero); + const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1); + const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero); + + const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10); + const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11); + const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset); + const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset); + // RightShiftWithRounding and Clip3. + const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset); + const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset); + const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne); + const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max); + if (width != 4) { + // Store width=8/16/32/64/128. 
+ StoreUnaligned16(dst + offset, result); + return; + } + assert(width == 4); + StoreLo8(dst, result); + StoreHi8(dst + dest_stride, result); +} + +void AverageBlend10bpp_SSE4_1(const void* prediction_0, + const void* prediction_1, const int width, + const int height, void* const dest, + const ptrdiff_t dst_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const __m128i compound_offset = + _mm_set1_epi32(kCompoundOffset + kCompoundOffset); + const __m128i round_offset = + _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + const __m128i zero = _mm_setzero_si128(); + int y = height; + + if (width == 4) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0,1 + AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 16) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. 
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 32) { + do { + // pred [0 - 15]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [16 - 31]. + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + if (width == 64) { + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. 
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + assert(width == 128); + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. + AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + + // pred [64 - 95]. + AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [96 - 127]. 
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void AverageBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h index 937e8e2..cd07112 100644 --- a/src/dsp/x86/average_blend_sse4.h +++ b/src/dsp/x86/average_blend_sse4.h @@ -32,9 +32,13 @@ void AverageBlendInit_SSE4_1(); // If sse4 is enabled and the baseline isn't set due to a higher level of // optimization being enabled, signal the sse4 implementation should be used. 
#if LIBGAV1_TARGETING_SSE4_1 + #ifndef LIBGAV1_Dsp8bpp_AverageBlend #define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_AverageBlend +#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc new file mode 100644 index 0000000..d41dc38 --- /dev/null +++ b/src/dsp/x86/cdef_avx2.cc @@ -0,0 +1,784 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include <immintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +#include "src/dsp/cdef.inc" + +// Used when calculating odd |cost[x]| values. +// Holds elements 1 3 5 7 7 7 7 7 +alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = { + 420, 210, 140, 105, 420, 210, 140, 105, + 105, 105, 105, 105, 105, 105, 105, 105}; + +// ---------------------------------------------------------------------------- +// Refer to CdefDirection_C(). 
+// +// int32_t partial[8][15] = {}; +// for (int i = 0; i < 8; ++i) { +// for (int j = 0; j < 8; ++j) { +// const int x = 1; +// partial[0][i + j] += x; +// partial[1][i + j / 2] += x; +// partial[2][i] += x; +// partial[3][3 + i - j / 2] += x; +// partial[4][7 + i - j] += x; +// partial[5][3 - i / 2 + j] += x; +// partial[6][j] += x; +// partial[7][i / 2 + j] += x; +// } +// } +// +// Using the code above, generate the position count for partial[8][15]. +// +// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// +// The SIMD code shifts the input horizontally, then adds vertically to get the +// correct partial value for the given position. +// ---------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------- +// partial[0][i + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00 +// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00 +// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00 +// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00 +// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00 +// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00 +// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77 +// +// partial[4] is the same except the source is reversed. 
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + // 00 01 02 03 04 05 06 07 + *partial_lo = v_src_16[0]; + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm256_setzero_si256(); + + // 00 10 11 12 13 14 15 16 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2)); + // 17 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4)); + // 26 27 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6)); + // 35 36 37 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8)); + // 44 45 46 47 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10)); + // 53 54 55 56 57 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12)); + // 62 63 64 65 66 67 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14)); + // 71 72 73 74 75 76 77 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = 
src[2] + src[3], ... +// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_d1_temp[8]; + const __m256i v_zero = _mm256_setzero_si256(); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero); + } + + *partial_lo = *partial_hi = v_zero; + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10)); + // F3 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = + 
_mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_pair_add[4]; + // Add vertical source pairs. + v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]); + v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]); + v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]); + v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]); + + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + *partial_lo = v_pair_add[0]; + // 00 00 00 00 00 00 00 00 + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm256_setzero_si256(); + + // 00 20 21 22 23 24 25 26 + // 00 30 31 32 33 34 35 36 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2)); + // 27 00 00 00 00 00 00 00 + // 37 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14)); + + // 00 00 40 41 42 43 44 45 + // 00 00 50 51 52 53 54 55 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4)); + // 46 47 00 00 00 00 00 00 + // 56 57 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12)); + + // 00 00 00 60 61 62 63 64 + // 00 00 00 70 71 72 73 74 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6)); + // 65 66 67 00 00 00 00 00 + // 75 76 77 00 00 00 
00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10)); +} + +LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride, + __m256i* partial) { + // 8x8 input + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + __m256i v_src[8]; + for (auto& i : v_src) { + i = _mm256_castsi128_si256(LoadLo8(src)); + // Dup lower lane. + i = _mm256_permute2x128_si256(i, i, 0x0); + src += stride; + } + + const __m256i v_zero = _mm256_setzero_si256(); + // partial for direction 2 + // -------------------------------------------------------------------------- + // partial[2][i] += x; + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 33 41 51 61 71 xx xx xx xx xx xx xx xx + // 02 12 22 33 42 52 62 72 xx xx xx xx xx xx xx xx + // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx + // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx + const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]); + const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]); + const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]); + const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]); + const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero); + const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero); + const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero); + const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero); + const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1); + const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3); + const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, 
v_hsum_5_1); + const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3); + partial[2] = + _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2), + _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6)); + + const __m256i extend_reverse = SetrM128i( + _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004), + static_cast<int>(0x80038002), static_cast<int>(0x80018000)), + _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003), + static_cast<int>(0x80048005), + static_cast<int>(0x80068007))); + + for (auto& i : v_src) { + // Zero extend unsigned 8 to 16. The upper lane is reversed. + i = _mm256_shuffle_epi8(i, extend_reverse); + } + + // partial for direction 6 + // -------------------------------------------------------------------------- + // partial[6][j] += x; + // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx + // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx + // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx + // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx + partial[6] = v_src[0]; + for (int i = 1; i < 8; ++i) { + partial[6] = _mm256_add_epi16(partial[6], v_src[i]); + } + + AddPartial_D0_D4(v_src, &partial[0], &partial[4]); + AddPartial_D1_D3(v_src, &partial[1], &partial[3]); + AddPartial_D7_D5(v_src, &partial[7], &partial[5]); +} + +inline __m256i SumVectorPair_S32(__m256i a) { + a = _mm256_hadd_epi32(a, a); + a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4)); + return a; +} + +// |cost[0]| and |cost[4]| square the input and sum with the corresponding +// element from the other end of the vector: +// |kCdefDivisionTable[]| element: +// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * 
kCdefDivisionTable[8]; +inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0, + const __m256i partial_4, + const __m256i division_table) { + const __m256i division_table_0 = + _mm256_permute2x128_si256(division_table, division_table, 0x0); + const __m256i division_table_1 = + _mm256_permute2x128_si256(division_table, division_table, 0x11); + + // partial_lo + const __m256i a = partial_0; + // partial_hi + const __m256i b = partial_4; + + // Reverse and clear upper 2 bytes. + const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32( + static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c)); + + // 14 13 12 11 10 09 08 ZZ + const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser); + // 00 14 01 13 02 12 03 11 + const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed); + // 04 10 05 09 06 08 07 ZZ + const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][14 - i]) + const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo); + const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi); + + const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0); + const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1); + const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d)); + // Copy upper 32bit sum to lower lane. + const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08)); + cost[0] = _mm_cvtsi128_si32(sums); + cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +template <int index_a, int index_b> +inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a, + const __m256i partial_b, + const __m256i division_table[2]) { + // partial_lo + const __m256i a = partial_a; + // partial_hi + const __m256i b = partial_b; + + // Reverse and clear upper 10 bytes. 
+ const __m256i reverser = _mm256_broadcastsi128_si256( + _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080), + static_cast<int>(0x80800100), 0x03020504)); + + // 10 09 08 ZZ ZZ ZZ ZZ ZZ + const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser); + // 00 10 01 09 02 08 03 ZZ + const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed); + // 04 ZZ 05 ZZ 06 ZZ 07 ZZ + const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][14 - i]) + const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo); + const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi); + + const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]); + const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]); + const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d)); + // Copy upper 32bit sum to lower lane. + const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08)); + cost[index_a] = _mm_cvtsi128_si32(sums); + cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a, + const __m256i partial_b, + const __m256i division_table) { + // The upper lane is a "don't care", so only use the lower lane for + // calculating cost. + const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20); + + const __m256i square_a = _mm256_madd_epi16(a, a); + const __m256i b = _mm256_mullo_epi32(square_a, division_table); + const __m256i c = SumVectorPair_S32(b); + // Copy upper 32bit sum to lower lane. 
+ const __m128i sums = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08)); + cost[2] = _mm_cvtsi128_si32(sums); + cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)); +} + +void CdefDirection_AVX2(const void* const source, ptrdiff_t stride, + uint8_t* const direction, int* const variance) { + assert(direction != nullptr); + assert(variance != nullptr); + const auto* src = static_cast<const uint8_t*>(source); + uint32_t cost[8]; + + // partial[0] = add partial 0,4 low + // partial[1] = add partial 1,3 low + // partial[2] = add partial 2 low + // partial[3] = add partial 1,3 high + // partial[4] = add partial 0,4 high + // partial[5] = add partial 7,5 high + // partial[6] = add partial 6 low + // partial[7] = add partial 7,5 low + __m256i partial[8]; + + AddPartial(src, stride, partial); + + const __m256i division_table = LoadUnaligned32(kCdefDivisionTable); + const __m256i division_table_7 = + _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7])); + + Cost2And6_Pair(cost, partial[2], partial[6], division_table_7); + + Cost0Or4_Pair(cost, partial[0], partial[4], division_table); + + const __m256i division_table_odd[2] = { + LoadUnaligned32(kCdefDivisionTableOddPairsPadded), + LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)}; + + CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd); + CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd); + + uint32_t best_cost = 0; + *direction = 0; + for (int i = 0; i < 8; ++i) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + *direction = i; + } + } + *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10; +} + +// ------------------------------------------------------------------------- +// CdefFilter + +// Load 4 vectors based on the given |direction|. +inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride, + __m128i* output, const int direction) { + // Each |direction| describes a different set of source values. 
Expand this + // set by negating each set. For |direction| == 0 this gives a diagonal line + // from top right to bottom left. The first value is y, the second x. Negative + // y values move up. + // a b c d + // {-1, 1}, {1, -1}, {-2, 2}, {2, -2} + // c + // a + // 0 + // b + // d + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = LoadUnaligned16(src - y_0 * stride - x_0); + output[1] = LoadUnaligned16(src + y_0 * stride + x_0); + output[2] = LoadUnaligned16(src - y_1 * stride - x_1); + output[3] = LoadUnaligned16(src + y_1 * stride + x_1); +} + +// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to +// do 2 rows at a time. +void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, + __m128i* output, const int direction) { + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0), + src - y_0 * stride + stride - x_0); + output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0), + src + y_0 * stride + stride + x_0); + output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1), + src - y_1 * stride + stride - x_1); + output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1), + src + y_1 * stride + stride + x_1); +} + +inline __m256i Constrain(const __m256i& pixel, const __m256i& reference, + const __m128i& damping, const __m256i& threshold) { + const __m256i diff = _mm256_sub_epi16(pixel, reference); + const __m256i abs_diff = _mm256_abs_epi16(diff); + // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping), + // 0, std::abs(diff)) + const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping); + // For bitdepth == 8, the threshold range is [0, 15] and the 
damping range is + // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be + // larger than threshold. Subtract using saturation will return 0 when pixel + // == kCdefLargeValue. + static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); + const __m256i thresh_minus_shifted_diff = + _mm256_subs_epu16(threshold, shifted_diff); + const __m256i clamp_abs_diff = + _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff); + // Restore the sign. + return _mm256_sign_epi16(clamp_abs_diff, diff); +} + +inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val, + const __m256i& tap, const __m128i& damping, + const __m256i& threshold) { + const __m256i constrained = Constrain(val, pixel, damping, threshold); + return _mm256_mullo_epi16(constrained, tap); +} + +template <int width, bool enable_primary = true, bool enable_secondary = true> +void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride, + const int height, const int primary_strength, + const int secondary_strength, const int damping, + const int direction, void* dest, + const ptrdiff_t dst_stride) { + static_assert(width == 8 || width == 4, "Invalid CDEF width."); + static_assert(enable_primary || enable_secondary, ""); + constexpr bool clipping_required = enable_primary && enable_secondary; + auto* dst = static_cast<uint8_t*>(dest); + __m128i primary_damping_shift, secondary_damping_shift; + + // FloorLog2() requires input to be > 0. + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. + primary_damping_shift = + _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength))); + } + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary. 
+ assert(damping - FloorLog2(secondary_strength) >= 0); + secondary_damping_shift = + _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength)); + } + const __m256i primary_tap_0 = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0])); + const __m256i primary_tap_1 = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1])); + const __m256i secondary_tap_0 = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0)); + const __m256i secondary_tap_1 = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1)); + const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16( + _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue))); + const __m256i primary_threshold = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength)); + const __m256i secondary_threshold = + _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength)); + + int y = height; + do { + __m128i pixel_128; + if (width == 8) { + pixel_128 = LoadUnaligned16(src); + } else { + pixel_128 = LoadHi8(LoadLo8(src), src + src_stride); + } + + __m256i pixel = SetrM128i(pixel_128, pixel_128); + + __m256i min = pixel; + __m256i max = pixel; + __m256i sum_pair; + + if (enable_primary) { + // Primary |direction|. + __m128i primary_val_128[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val_128, direction); + } else { + LoadDirection4(src, src_stride, primary_val_128, direction); + } + + __m256i primary_val[2]; + primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]); + primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]); + + if (clipping_required) { + min = _mm256_min_epu16(min, primary_val[0]); + min = _mm256_min_epu16(min, primary_val[1]); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. 
Use this + // to find the "16 bit" max. + const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]); + max = _mm256_max_epu16( + max, _mm256_and_si256(max_p01, cdef_large_value_mask)); + } + + sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0, + primary_damping_shift, primary_threshold); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1, + primary_damping_shift, primary_threshold)); + } else { + sum_pair = _mm256_setzero_si256(); + } + + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. + __m128i secondary_val_128[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val_128, direction + 2); + LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val_128, direction + 2); + LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2); + } + + __m256i secondary_val[4]; + secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]); + secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]); + secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]); + secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]); + + if (clipping_required) { + min = _mm256_min_epu16(min, secondary_val[0]); + min = _mm256_min_epu16(min, secondary_val[1]); + min = _mm256_min_epu16(min, secondary_val[2]); + min = _mm256_min_epu16(min, secondary_val[3]); + + const __m256i max_s01 = + _mm256_max_epu8(secondary_val[0], secondary_val[1]); + const __m256i max_s23 = + _mm256_max_epu8(secondary_val[2], secondary_val[3]); + const __m256i max_s = _mm256_max_epu8(max_s01, max_s23); + max = _mm256_max_epu8(max, + _mm256_and_si256(max_s, cdef_large_value_mask)); + } + + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair 
= _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } + + __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair), + _mm256_extracti128_si256(sum_pair, 1)); + + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); + // 8 + sum + sum = _mm_add_epi16(sum, _mm_set1_epi16(8)); + // (... - (sum < 0)) >> 4 + sum = _mm_add_epi16(sum, sum_lt_0); + sum = _mm_srai_epi16(sum, 4); + // pixel + ... + sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel)); + if (clipping_required) { + const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min), + _mm256_extracti128_si256(min, 1)); + + const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max), + _mm256_extracti128_si256(max, 1)); + // Clip3 + sum = _mm_min_epi16(sum, max_128); + sum = _mm_max_epi16(sum, min_128); + } + + const __m128i result = _mm_packus_epi16(sum, sum); + if (width == 8) { + src += src_stride; + StoreLo8(dst, result); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + Store4(dst, result); + dst += dst_stride; + Store4(dst, _mm_srli_si128(result, 4)); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_AVX2; + + dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>; + dsp->cdef_filters[0][1] = + CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = 
CdefFilter_AVX2<8>; + dsp->cdef_filters[1][1] = + CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>; +} + +} // namespace +} // namespace low_bitdepth + +void CdefInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void CdefInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/cdef_avx2.h b/src/dsp/x86/cdef_avx2.h new file mode 100644 index 0000000..41f2d3f --- /dev/null +++ b/src/dsp/x86/cdef_avx2.h @@ -0,0 +1,45 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ +#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. 
+void CdefInit_AVX2(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_AVX2 + +#ifndef LIBGAV1_Dsp8bpp_CdefDirection +#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_CdefFilters +#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2 +#endif + +#endif // LIBGAV1_TARGETING_AVX2 + +#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc index 3211a2d..6ede778 100644 --- a/src/dsp/x86/cdef_sse4.cc +++ b/src/dsp/x86/cdef_sse4.cc @@ -349,8 +349,8 @@ inline uint32_t SumVector_S32(__m128i a) { inline uint32_t Cost0Or4(const __m128i a, const __m128i b, const __m128i division_table[2]) { // Reverse and clear upper 2 bytes. - const __m128i reverser = - _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c); + const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100), + 0x03020504, 0x07060908, 0x0b0a0d0c); // 14 13 12 11 10 09 08 ZZ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); // 00 14 01 13 02 12 03 11 @@ -371,7 +371,8 @@ inline uint32_t CostOdd(const __m128i a, const __m128i b, const __m128i division_table[2]) { // Reverse and clear upper 10 bytes. 
const __m128i reverser = - _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504); + _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080), + static_cast<int>(0x80800100), 0x03020504); // 10 09 08 ZZ ZZ ZZ ZZ ZZ const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); // 00 10 01 09 02 08 03 ZZ @@ -717,7 +718,7 @@ void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h index 4ce7de2..373116a 100644 --- a/src/dsp/x86/common_avx2.h +++ b/src/dsp/x86/common_avx2.h @@ -27,109 +27,60 @@ #include <cassert> #include <cstddef> #include <cstdint> +#include <cstring> namespace libgav1 { namespace dsp { - -//------------------------------------------------------------------------------ -// Compatibility functions. - -inline __m256i SetrM128i(const __m128i lo, const __m128i hi) { - // For compatibility with older gcc toolchains (< 8) use - // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations - // are implemented similarly to the following, clang uses a different method - // but no differences in assembly have been observed. - return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); -} - -//------------------------------------------------------------------------------ -// Load functions. 
- -inline __m256i LoadAligned32(const void* a) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - return _mm256_load_si256(static_cast<const __m256i*>(a)); -} - -inline void LoadAligned64(const void* a, __m256i dst[2]) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); - dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); -} - -inline __m256i LoadUnaligned32(const void* a) { - return _mm256_loadu_si256(static_cast<const __m256i*>(a)); -} - -//------------------------------------------------------------------------------ -// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. - -inline __m256i MaskOverreads(const __m256i source, - const ptrdiff_t over_read_in_bytes) { - __m256i dst = source; -#if LIBGAV1_MSAN - if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); - if (over_read_in_bytes > 0) { - __m128i m = _mm_set1_epi8(-1); - for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { - m = _mm_srli_si128(m, 1); - } - const __m256i mask = (over_read_in_bytes < 16) - ? 
SetrM128i(_mm_set1_epi8(-1), m) - : SetrM128i(m, _mm_setzero_si128()); - dst = _mm256_and_si256(dst, mask); - } -#else - static_cast<void>(over_read_in_bytes); -#endif - return dst; -} - -inline __m256i LoadAligned32Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadAligned32(source), over_read_in_bytes); -} - -inline void LoadAligned64Msan(const void* const source, - const ptrdiff_t over_read_in_bytes, - __m256i dst[2]) { - dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); - dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), - over_read_in_bytes); -} - -inline __m256i LoadUnaligned32Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); -} - -//------------------------------------------------------------------------------ -// Store functions. - -inline void StoreAligned32(void* a, const __m256i v) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - _mm256_store_si256(static_cast<__m256i*>(a), v); -} - -inline void StoreAligned64(void* a, const __m256i v[2]) { - assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); - _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); - _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); -} - -inline void StoreUnaligned32(void* a, const __m256i v) { - _mm256_storeu_si256(static_cast<__m256i*>(a), v); -} - -//------------------------------------------------------------------------------ -// Arithmetic utilities. 
- -inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { - assert(bits <= 16); - const __m256i v_bias_d = - _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); - const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); - return _mm256_srai_epi16(v_tmp_d, bits); -} +namespace avx2 { + +#include "src/dsp/x86/common_avx2.inc" +#include "src/dsp/x86/common_sse4.inc" + +} // namespace avx2 + +// NOLINTBEGIN(misc-unused-using-decls) +// These function aliases shall not be visible to external code. They are +// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two +// possible implementations of common functions, which may differ based on +// whether the compiler is permitted to use avx2 instructions. + +// common_sse4.inc +using avx2::Load2; +using avx2::Load2x2; +using avx2::Load4; +using avx2::Load4x2; +using avx2::LoadAligned16; +using avx2::LoadAligned16Msan; +using avx2::LoadHi8; +using avx2::LoadHi8Msan; +using avx2::LoadLo8; +using avx2::LoadLo8Msan; +using avx2::LoadUnaligned16; +using avx2::LoadUnaligned16Msan; +using avx2::MaskHighNBytes; +using avx2::RightShiftWithRounding_S16; +using avx2::RightShiftWithRounding_S32; +using avx2::RightShiftWithRounding_U16; +using avx2::RightShiftWithRounding_U32; +using avx2::Store2; +using avx2::Store4; +using avx2::StoreAligned16; +using avx2::StoreHi8; +using avx2::StoreLo8; +using avx2::StoreUnaligned16; + +// common_avx2.inc +using avx2::LoadAligned32; +using avx2::LoadAligned32Msan; +using avx2::LoadAligned64; +using avx2::LoadAligned64Msan; +using avx2::LoadUnaligned32; +using avx2::LoadUnaligned32Msan; +using avx2::SetrM128i; +using avx2::StoreAligned32; +using avx2::StoreAligned64; +using avx2::StoreUnaligned32; +// NOLINTEND } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/common_avx2.inc b/src/dsp/x86/common_avx2.inc new file mode 100644 index 0000000..53b4e2e --- /dev/null +++ b/src/dsp/x86/common_avx2.inc @@ -0,0 +1,121 @@ +/* + * 
Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//------------------------------------------------------------------------------ +// Compatibility functions. + +inline __m256i SetrM128i(const __m128i lo, const __m128i hi) { + // For compatibility with older gcc toolchains (< 8) use + // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations + // are implemented similarly to the following, clang uses a different method + // but no differences in assembly have been observed. + return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +//------------------------------------------------------------------------------ +// Load functions. + +inline __m256i LoadAligned32(const void* a) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + return _mm256_load_si256(static_cast<const __m256i*>(a)); +} + +inline void LoadAligned64(const void* a, __m256i dst[2]) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0); + dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1); +} + +inline __m256i LoadUnaligned32(const void* a) { + return _mm256_loadu_si256(static_cast<const __m256i*>(a)); +} + +//------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 
+ +inline __m256i MaskOverreads(const __m256i source, + const ptrdiff_t over_read_in_bytes) { + __m256i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes >= 32) return _mm256_setzero_si256(); + if (over_read_in_bytes > 0) { + __m128i m = _mm_set1_epi8(-1); + for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) { + m = _mm_srli_si128(m, 1); + } + const __m256i mask = (over_read_in_bytes < 16) + ? SetrM128i(_mm_set1_epi8(-1), m) + : SetrM128i(m, _mm_setzero_si128()); + dst = _mm256_and_si256(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m256i LoadAligned32Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadAligned32(source), over_read_in_bytes); +} + +inline void LoadAligned64Msan(const void* const source, + const ptrdiff_t over_read_in_bytes, + __m256i dst[2]) { + dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes); + dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1), + over_read_in_bytes); +} + +inline __m256i LoadUnaligned32Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes); +} + +//------------------------------------------------------------------------------ +// Store functions. + +inline void StoreAligned32(void* a, const __m256i v) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + _mm256_store_si256(static_cast<__m256i*>(a), v); +} + +inline void StoreAligned64(void* a, const __m256i v[2]) { + assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0); + _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]); + _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]); +} + +inline void StoreUnaligned32(void* a, const __m256i v) { + _mm256_storeu_si256(static_cast<__m256i*>(a), v); +} + +//------------------------------------------------------------------------------ +// Arithmetic utilities. 
+ +inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) { + assert(bits <= 16); + const __m256i v_bias_d = + _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); + const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d); + return _mm256_srai_epi16(v_tmp_d, bits); +} + +inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) { + const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d); + return _mm256_srai_epi32(v_tmp_d, bits); +} diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h index c510f8c..41a3a68 100644 --- a/src/dsp/x86/common_sse4.h +++ b/src/dsp/x86/common_sse4.h @@ -28,7 +28,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <cstring> #if 0 @@ -71,192 +70,58 @@ inline void PrintRegX(const int r, const char* const name) { #define PR(var, N) PrintReg(var, #var, N) #define PD(var) PrintReg(var, #var); #define PX(var) PrintRegX(var, #var); -#endif // 0 - -namespace libgav1 { -namespace dsp { - -//------------------------------------------------------------------------------ -// Load functions. - -inline __m128i Load2(const void* src) { - int16_t val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load2x2(const void* src1, const void* src2) { - uint16_t val1; - uint16_t val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_cvtsi32_si128(val1 | (val2 << 16)); -} - -// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. -template <int lane> -inline __m128i Load2(const void* const buf, __m128i val) { - uint16_t temp; - memcpy(&temp, buf, 2); - return _mm_insert_epi16(val, temp, lane); -} - -inline __m128i Load4(const void* src) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. 
- // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val; - memcpy(&val, src, sizeof(val)); - return _mm_cvtsi32_si128(val); -} - -inline __m128i Load4x2(const void* src1, const void* src2) { - // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 - // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a - // movss instruction. - // - // Until compiler support of _mm_loadu_si32 is widespread, use of - // _mm_loadu_si32 is banned. - int val1, val2; - memcpy(&val1, src1, sizeof(val1)); - memcpy(&val2, src2, sizeof(val2)); - return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1); -} -inline __m128i LoadLo8(const void* a) { - return _mm_loadl_epi64(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadHi8(const __m128i v, const void* a) { - const __m128 x = - _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a)); - return _mm_castps_si128(x); -} - -inline __m128i LoadUnaligned16(const void* a) { - return _mm_loadu_si128(static_cast<const __m128i*>(a)); -} - -inline __m128i LoadAligned16(const void* a) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - return _mm_load_si128(static_cast<const __m128i*>(a)); -} - -//------------------------------------------------------------------------------ -// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. 
- -inline __m128i MaskOverreads(const __m128i source, - const ptrdiff_t over_read_in_bytes) { - __m128i dst = source; #if LIBGAV1_MSAN - if (over_read_in_bytes > 0) { - __m128i mask = _mm_set1_epi8(-1); - for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) { - mask = _mm_srli_si128(mask, 1); - } - dst = _mm_and_si128(dst, mask); - } -#else - static_cast<void>(over_read_in_bytes); -#endif - return dst; -} +#include <sanitizer/msan_interface.h> -inline __m128i LoadLo8Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +inline void PrintShadow(const void* r, const char* const name, + const size_t size) { + fprintf(stderr, "Shadow for %s:\n", name); + __msan_print_shadow(r, size); } +#define PS(var, N) PrintShadow(var, #var, N) -inline __m128i LoadHi8Msan(const __m128i v, const void* source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadHi8(v, source), over_read_in_bytes); -} - -inline __m128i LoadAligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadAligned16(source), over_read_in_bytes); -} +#endif // LIBGAV1_MSAN -inline __m128i LoadUnaligned16Msan(const void* const source, - const ptrdiff_t over_read_in_bytes) { - return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); -} - -//------------------------------------------------------------------------------ -// Store functions. 
- -inline void Store2(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, 2); -} - -inline void Store4(void* dst, const __m128i x) { - const int val = _mm_cvtsi128_si32(x); - memcpy(dst, &val, sizeof(val)); -} - -inline void StoreLo8(void* a, const __m128i v) { - _mm_storel_epi64(static_cast<__m128i*>(a), v); -} - -inline void StoreHi8(void* a, const __m128i v) { - _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); -} - -inline void StoreAligned16(void* a, const __m128i v) { - assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); - _mm_store_si128(static_cast<__m128i*>(a), v); -} - -inline void StoreUnaligned16(void* a, const __m128i v) { - _mm_storeu_si128(static_cast<__m128i*>(a), v); -} - -//------------------------------------------------------------------------------ -// Arithmetic utilities. - -inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. 
- return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128()); -} - -inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) { - assert(bits <= 16); - const __m128i v_bias_d = - _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); - const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); - return _mm_srai_epi16(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srli_epi32(v_tmp_d, bits); -} - -inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - -//------------------------------------------------------------------------------ -// Masking utilities -inline __m128i MaskHighNBytes(int n) { - static constexpr uint8_t kMask[32] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - }; +#endif // 0 - return LoadUnaligned16(kMask + n); -} +namespace libgav1 { +namespace dsp { +namespace sse4 { + +#include "src/dsp/x86/common_sse4.inc" + +} // namespace sse4 + +// NOLINTBEGIN(misc-unused-using-decls) +// These function aliases shall not be visible to external code. They are +// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two +// possible implementations of common functions, which may differ based on +// whether the compiler is permitted to use avx2 instructions. 
+using sse4::Load2; +using sse4::Load2x2; +using sse4::Load4; +using sse4::Load4x2; +using sse4::LoadAligned16; +using sse4::LoadAligned16Msan; +using sse4::LoadHi8; +using sse4::LoadHi8Msan; +using sse4::LoadLo8; +using sse4::LoadLo8Msan; +using sse4::LoadUnaligned16; +using sse4::LoadUnaligned16Msan; +using sse4::MaskHighNBytes; +using sse4::RightShiftWithRounding_S16; +using sse4::RightShiftWithRounding_S32; +using sse4::RightShiftWithRounding_U16; +using sse4::RightShiftWithRounding_U32; +using sse4::Store2; +using sse4::Store4; +using sse4::StoreAligned16; +using sse4::StoreHi8; +using sse4::StoreLo8; +using sse4::StoreUnaligned16; +// NOLINTEND } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/common_sse4.inc b/src/dsp/x86/common_sse4.inc new file mode 100644 index 0000000..35c56b8 --- /dev/null +++ b/src/dsp/x86/common_sse4.inc @@ -0,0 +1,206 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//------------------------------------------------------------------------------ +// Load functions. 
+ +inline __m128i Load2(const void* src) { + int16_t val; + memcpy(&val, src, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +inline __m128i Load2x2(const void* src1, const void* src2) { + uint16_t val1; + uint16_t val2; + memcpy(&val1, src1, sizeof(val1)); + memcpy(&val2, src2, sizeof(val2)); + return _mm_cvtsi32_si128(val1 | (val2 << 16)); +} + +// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1. +template <int lane> +inline __m128i Load2(const void* const buf, __m128i val) { + int16_t temp; + memcpy(&temp, buf, 2); + return _mm_insert_epi16(val, temp, lane); +} + +inline __m128i Load4(const void* src) { + // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 + // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a + // movss instruction. + // + // Until compiler support of _mm_loadu_si32 is widespread, use of + // _mm_loadu_si32 is banned. + int val; + memcpy(&val, src, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +inline __m128i Load4x2(const void* src1, const void* src2) { + // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 + // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a + // movss instruction. + // + // Until compiler support of _mm_loadu_si32 is widespread, use of + // _mm_loadu_si32 is banned. 
+ int val1, val2; + memcpy(&val1, src1, sizeof(val1)); + memcpy(&val2, src2, sizeof(val2)); + return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1); +} + +inline __m128i LoadLo8(const void* a) { + return _mm_loadl_epi64(static_cast<const __m128i*>(a)); +} + +inline __m128i LoadHi8(const __m128i v, const void* a) { + const __m128 x = + _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a)); + return _mm_castps_si128(x); +} + +inline __m128i LoadUnaligned16(const void* a) { + return _mm_loadu_si128(static_cast<const __m128i*>(a)); +} + +inline __m128i LoadAligned16(const void* a) { + assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); + return _mm_load_si128(static_cast<const __m128i*>(a)); +} + +//------------------------------------------------------------------------------ +// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning. + +inline __m128i MaskOverreads(const __m128i source, + const ptrdiff_t over_read_in_bytes) { + __m128i dst = source; +#if LIBGAV1_MSAN + if (over_read_in_bytes > 0) { + __m128i mask = _mm_set1_epi8(-1); + for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) { + mask = _mm_srli_si128(mask, 1); + } + dst = _mm_and_si128(dst, mask); + } +#else + static_cast<void>(over_read_in_bytes); +#endif + return dst; +} + +inline __m128i LoadLo8Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8); +} + +inline __m128i LoadHi8Msan(const __m128i v, const void* source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadHi8(v, source), over_read_in_bytes); +} + +inline __m128i LoadAligned16Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadAligned16(source), over_read_in_bytes); +} + +inline __m128i LoadUnaligned16Msan(const void* const source, + const ptrdiff_t over_read_in_bytes) { + return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes); +} + 
+//------------------------------------------------------------------------------ +// Store functions. + +inline void Store2(void* dst, const __m128i x) { + const int val = _mm_cvtsi128_si32(x); + memcpy(dst, &val, 2); +} + +inline void Store4(void* dst, const __m128i x) { + const int val = _mm_cvtsi128_si32(x); + memcpy(dst, &val, sizeof(val)); +} + +inline void StoreLo8(void* a, const __m128i v) { + _mm_storel_epi64(static_cast<__m128i*>(a), v); +} + +inline void StoreHi8(void* a, const __m128i v) { + _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v)); +} + +inline void StoreAligned16(void* a, const __m128i v) { + assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0); + _mm_store_si128(static_cast<__m128i*>(a), v); +} + +inline void StoreUnaligned16(void* a, const __m128i v) { + _mm_storeu_si128(static_cast<__m128i*>(a), v); +} + +//------------------------------------------------------------------------------ +// Arithmetic utilities. + +inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) { + assert(bits <= 16); + // Shift out all but the last bit. + const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); + // Avg with zero will shift by 1 and round. 
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128()); +} + +inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) { + assert(bits < 16); + const __m128i v_bias_d = + _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1)); + const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + +inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +// Use this when |bits| is not an immediate value. +inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d, + int bits) { + const __m128i v_bias_d = + _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1)); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits)); +} + +//------------------------------------------------------------------------------ +// Masking utilities +inline __m128i MaskHighNBytes(int n) { + static constexpr uint8_t kMask[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + }; + + return LoadUnaligned16(kMask + n); +} diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc index 3df2120..2ecb77c 100644 --- a/src/dsp/x86/convolve_avx2.cc +++ b/src/dsp/x86/convolve_avx2.cc @@ -26,7 +26,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -35,7 +34,7 @@ namespace dsp { namespace low_bitdepth { 
namespace { -constexpr int kHorizontalOffset = 3; +#include "src/dsp/x86/convolve_sse4.inc" // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final @@ -118,58 +117,15 @@ __m256i SimpleHorizontalTaps(const __m256i* const src, } template <int filter_index> -__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); - - if (filter_index == 3) { - // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 - const __m128i v_src_43 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); - const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 - return v_sum_43; - } - - // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 - const __m128i v_src_32 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); - // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx - const __m128i v_src_54 = _mm_shuffle_epi8( - v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504)); - const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 - const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); - return v_sum_5432; -} - -template <int filter_index> -__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. 
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); - sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); - return _mm_packus_epi16(sum, sum); -} - -template <int filter_index> -__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); +__m256i HorizontalTaps8To16(const __m256i* const src, + const __m256i* const v_tap) { + const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } // Filter 2xh sizes. -template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -183,7 +139,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, assert(num_taps <= 4); if (num_taps <= 4) { if (!is_compound) { - int y = 0; + int y = height; + if (is_2d) y -= 1; do { if (is_2d) { const __m128i sum = @@ -202,8 +159,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } src += src_stride << 1; - y += 2; - } while (y < height - 1); + y -= 2; + } while (y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. @@ -236,7 +193,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } // Filter widths >= 4. 
-template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -251,7 +208,22 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, int x = 0; do { if (is_2d || is_compound) { - // placeholder + // Load into 2 128 bit lanes. + const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]), + LoadUnaligned16(&src[x + 24])); + const __m256i result2 = + HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + if (is_2d) { + StoreAligned32(&dest16[x], result); + StoreAligned32(&dest16[x + 16], result2); + } else { + StoreUnaligned32(&dest16[x], result); + StoreUnaligned32(&dest16[x + 16], result2); + } } else { // Load src used to calculate dest8[7:0] and dest8[23:16]. const __m256i src_long = LoadUnaligned32(&src[x]); @@ -264,7 +236,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // Combine results and store. StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2)); } - x += step * 4; + x += 32; } while (x < width); src += src_stride; dest8 += pred_stride; @@ -272,9 +244,26 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } while (--y != 0); } else if (width == 16) { int y = height; + if (is_2d) y -= 1; do { if (is_2d || is_compound) { - // placeholder + // Load into 2 128 bit lanes. 
+ const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i src_long2 = + SetrM128i(LoadUnaligned16(&src[src_stride]), + LoadUnaligned16(&src[8 + src_stride])); + const __m256i result2 = + HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + if (is_2d) { + StoreAligned32(&dest16[0], result); + StoreAligned32(&dest16[pred_stride], result2); + } else { + StoreUnaligned32(&dest16[0], result); + StoreUnaligned32(&dest16[pred_stride], result2); + } } else { // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), @@ -295,11 +284,37 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = + SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreAligned32(&dest16[0], result); + } + } else if (width == 8) { int y = height; + if (is_2d) y -= 1; do { + // Load into 2 128 bit lanes. 
+ const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - // placeholder + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + if (is_2d) { + StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); + StoreAligned16(&dest16[pred_stride], + _mm256_extracti128_si256(result, 1)); + } else { + StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result)); + StoreUnaligned16(&dest16[pred_stride], + _mm256_extracti128_si256(result, 1)); + } } else { const __m128i this_row = LoadUnaligned16(&src[0]); const __m128i next_row = LoadUnaligned16(&src[src_stride]); @@ -315,11 +330,29 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); + } + } else { // width == 4 int y = height; + if (is_2d) y -= 1; do { + // Load into 2 128 bit lanes. 
+ const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - // placeholder + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); + StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1)); } else { const __m128i this_row = LoadUnaligned16(&src[0]); const __m128i next_row = LoadUnaligned16(&src[src_stride]); @@ -335,93 +368,176 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, dest16 += pred_stride * 2; y -= 2; } while (y != 0); + + // The 2d filters have an odd |height| during the horizontal pass, so + // filter the remaining row. + if (is_2d) { + const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); + const __m256i result = + HorizontalTaps8To16<filter_index>(&src_long, v_tap); + StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); + } } } template <int num_taps, bool is_2d_vertical = false> LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m128i* v_tap) { + __m256i* v_tap) { if (num_taps == 8) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2 + v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4 + v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = 
_mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 + v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6 } } else if (num_taps == 6) { - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 - v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3 + v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5 } } else if (num_taps == 4) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2 + v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[0] = 
_mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 } } else { // num_taps == 2 - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3 } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 } } } -template <int num_taps, bool is_2d_vertical = false> -LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m256i* v_tap) { - if (num_taps == 8) { - v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 - v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 - v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6 - if (is_2d_vertical) { - // placeholder - } - } else if (num_taps == 6) { - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 - v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5 - if (is_2d_vertical) { - // placeholder - } - } else if (num_taps == 4) { - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 - v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 - if (is_2d_vertical) { - // placeholder - } - } else { // num_taps == 2 - v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 - if (is_2d_vertical) { - // placeholder +template <int num_taps, bool is_compound> +__m256i SimpleSum2DVerticalTaps(const __m256i* const src, + const __m256i* const taps) { + __m256i sum_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]); + __m256i sum_hi = + 
_mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m256i madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]); + __m256i madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = + _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = + _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm256_add_epi32(sum_lo, madd_lo); + sum_hi = _mm256_add_epi32(sum_hi, madd_hi); + } } } + + if (is_compound) { + return _mm256_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm256_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical16xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m256i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. 
+ const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m256i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned32(src_x); + src_x += src_stride; + srcs[2] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned32(src_x); + src_x += src_stride; + srcs[4] = LoadAligned32(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned32(src_x); + src_x += src_stride; + srcs[6] = LoadAligned32(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadAligned32(src_x); + src_x += src_stride; + + const __m256i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned32(dst16_x, sum); + dst16_x += dst_stride; + } else { + const __m128i packed_sum = _mm_packus_epi16( + _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + StoreUnaligned16(dst8_x, packed_sum); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 16; + } while (x < width); } template <bool is_2d = false, bool is_compound = false> @@ -436,16 +552,16 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH( if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -461,28 +577,792 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 8, 2, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 1, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 0, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } +} + +void Convolve2D_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(32) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + if (width > 2) { + DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, + width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + } else { + // Use non avx2 version for smaller widths. + DoHorizontalPass2xH</*is_2d=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + } + + // Vertical filter. 
+ auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]); + + // Use 256 bits for width > 8. + if (width > 8) { + __m256i taps_256[4]; + const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width, + height, taps_256); + } + } else { // width <= 8 + __m128i taps[4]; + // Use 128 bit code. 
+ if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 2) { + Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else if (width == 4) { + Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height, + taps); + } else { + Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, + height, taps); + } + } + } +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. 
+__m256i Compound1DShift(const __m256i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index, bool unpack_high = false> +__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) { + __m256i v_src[4]; + + if (!unpack_high) { + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); + } + } else { + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); + v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. 
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); + v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); + } + } + return SumOnePassTaps<filter_index>(v_src, v_tap); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int width, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + assert(width >= 32); + int x = 0; + do { + const uint8_t* src_x = src + x; + __m256i srcs[8]; + srcs[0] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[2] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[4] = LoadUnaligned32(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadUnaligned32(src_x); + src_x += src_stride; + srcs[6] = LoadUnaligned32(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadUnaligned32(src_x); + src_x += src_stride; + + const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums_hi = + SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + if (is_compound) { + const __m256i results = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); + const __m256i results_hi = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31)); + StoreUnaligned32(dst16_x, results); + StoreUnaligned32(dst16_x + 16, results_hi); + dst16_x += dst_stride; + } else { + const __m256i results = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i results_hi = + RightShiftWithRounding_S16(sums_hi, kFilterBits - 1); + 
const __m256i packed_results = _mm256_packus_epi16(results, results_hi); + + StoreUnaligned32(dst8_x, packed_results); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 32; + } while (x < width); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m256i srcs[8 + 1]; + // The upper 128 bits hold the filter data for the next row. + srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[0] = + _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1); + srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[1] = + _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1); + if (num_taps >= 6) { + srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[2] = + _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1); + srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[3] = + _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1); + if (num_taps == 8) { + srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[4] = _mm256_inserti128_si256(srcs[4], + _mm256_castsi256_si128(srcs[5]), 1); + srcs[6] = 
_mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + srcs[5] = _mm256_inserti128_si256(srcs[5], + _mm256_castsi256_si128(srcs[6]), 1); + } + } + } + + int y = height; + do { + srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + + srcs[next_row - 2] = _mm256_inserti128_si256( + srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1); + + srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x)); + src_x += src_stride; + + srcs[next_row - 1] = _mm256_inserti128_si256( + srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); + + const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums_hi = + SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + if (is_compound) { + const __m256i results = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); + const __m256i results_hi = + Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31)); + + StoreUnaligned32(dst16, results); + StoreUnaligned32(dst16 + dst_stride, results_hi); + dst16 += dst_stride << 1; + } else { + const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i results_hi = + RightShiftWithRounding_S16(sums_hi, kFilterBits - 1); + const __m256i packed_results = _mm256_packus_epi16(results, results_hi); + const __m128i this_dst = _mm256_castsi256_si128(packed_results); + const auto next_dst = _mm256_extracti128_si256(packed_results, 1); + + StoreUnaligned16(dst8, this_dst); + StoreUnaligned16(dst8 + dst_stride, next_dst); + dst8 += dst_stride << 1; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical8xH(const uint8_t* src, const ptrdiff_t 
src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m256i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m256i srcs[8 + 1]; + // The upper 128 bits hold the filter data for the next row. + srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[0] = + _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1); + srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[1] = + _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1); + if (num_taps >= 6) { + srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[2] = + _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1); + srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[3] = + _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1); + if (num_taps == 8) { + srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[4] = _mm256_inserti128_si256(srcs[4], + _mm256_castsi256_si128(srcs[5]), 1); + srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + srcs[5] = _mm256_inserti128_si256(srcs[5], + _mm256_castsi256_si128(srcs[6]), 1); + } + } + } + + int y = height; + do { + srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + + srcs[next_row - 2] = _mm256_inserti128_si256( + srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1); + + srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x)); + src_x += src_stride; + + srcs[next_row - 1] = _mm256_inserti128_si256( + srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); + + const __m256i sums = 
SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m256i results = Compound1DShift(sums); + const __m128i this_dst = _mm256_castsi256_si128(results); + const auto next_dst = _mm256_extracti128_si256(results, 1); + + StoreUnaligned16(dst16, this_dst); + StoreUnaligned16(dst16 + dst_stride, next_dst); + dst16 += dst_stride << 1; + } else { + const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m256i packed_results = _mm256_packus_epi16(results, results); + const __m128i this_dst = _mm256_castsi256_si128(packed_results); + const auto next_dst = _mm256_extracti128_si256(packed_results, 1); + + StoreLo8(dst8, this_dst); + StoreLo8(dst8 + dst_stride, next_dst); + dst8 += dst_stride << 1; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template <int filter_index, bool is_compound = false> +void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int /*width*/, const int height, + const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + const uint8_t* src_x = src; + __m128i srcs[8]; + srcs[0] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadLo8(src_x); + src_x += src_stride; + srcs[2] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadLo8(src_x); + src_x += src_stride; + srcs[4] = LoadLo8(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadLo8(src_x); + src_x += src_stride; + srcs[6] = LoadLo8(src_x); + src_x += src_stride; + } + } + } + + int y = height; + do { + srcs[next_row] = LoadLo8(src_x); + 
src_x += src_stride; + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += dst_stride; + } else { + const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); + StoreLo8(dst8, _mm_packus_epi16(results, results)); + dst8 += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); +} + +void ConvolveVertical_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int vertical_filter_index, + const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + // Use 256 bits for width > 4. + if (width > 4) { + __m256i taps_256[4]; + if (filter_index < 2) { // 6 tap. 
+ SetupTaps<6>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } else { + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else if (width == 16) { + FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } else { + FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height, + taps_256); + } + } + } else { // width <= 8 + // Use 128 bit code. + __m128i taps[4]; + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, + taps); + } + } else if (filter_index == 4) { // 4 tap. 
+ SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, + taps); + } + } else { + SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, + taps); + } else { + FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, + taps); + } + } + } +} + +void ConvolveCompoundVertical_AVX2( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int vertical_filter_index, + const int /*horizontal_filter_id*/, const int vertical_filter_id, + const int width, const int height, void* prediction, + const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = width; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + // Use 256 bits for width > 4. + if (width > 4) { + __m256i taps_256[4]; + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 2) { // 8 tap. 
+ SetupTaps<8>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else { + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } + } else { // width <= 4 + // Use 128 bit code. + __m128i taps[4]; + + if (filter_index < 2) { // 6 tap. 
+ SetupTaps<6>(&v_filter, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else { + SetupTaps<4>(&v_filter, taps); + FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } } } @@ -509,10 +1389,140 @@ void ConvolveHorizontal_AVX2(const void* const reference, } } +void ConvolveCompoundHorizontal_AVX2( + const void* const reference, const ptrdiff_t reference_stride, + const int horizontal_filter_index, const int /*vertical_filter_index*/, + const int horizontal_filter_id, const int /*vertical_filter_id*/, + const int width, const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(horizontal_filter_index, width); + // Set |src| to the outermost tap. + const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* dest = static_cast<uint8_t*>(prediction); + // All compound functions output to the predictor buffer with |pred_stride| + // equal to |width|. + assert(pred_stride == width); + // Compound functions start at 4x4. + assert(width >= 4 && height >= 4); + +#ifdef NDEBUG + // Quiet compiler error. 
+ (void)pred_stride; +#endif + + DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>( + src, reference_stride, dest, width, width, height, horizontal_filter_id, + filter_index); +} + +void ConvolveCompound2D_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + + // The output of the horizontal filter is guaranteed to fit in 16 bits. + alignas(32) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + const int intermediate_height = height + vertical_taps - 1; + + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast<const uint8_t*>(reference) - + (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset; + DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>( + src, src_stride, intermediate_result, width, width, intermediate_height, + horizontal_filter_id, horiz_filter_index); + + // Vertical filter. + auto* dest = static_cast<uint8_t*>(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]); + + // Use 256 bits for width > 8. 
+ if (width > 8) { + __m256i taps_256[4]; + const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } + } else { // width <= 8 + __m128i taps[4]; + // Use 128 bit code. 
+ if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } + } +} + void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2; + dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2; + dsp->convolve[0][0][1][1] = Convolve2D_AVX2; + + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2; } } // namespace @@ -523,7 +1533,7 @@ void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_AVX2 +#else // 
!LIBGAV1_TARGETING_AVX2 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h index 6179d98..e509bc9 100644 --- a/src/dsp/x86/convolve_avx2.h +++ b/src/dsp/x86/convolve_avx2.h @@ -38,6 +38,22 @@ void ConvolveInit_AVX2(); #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2 #endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal +#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical +#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_Convolve2D +#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical +#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2 +#endif + #endif // LIBGAV1_TARGETING_AVX2 #endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc index 3a0fff5..9b72fe4 100644 --- a/src/dsp/x86/convolve_sse4.cc +++ b/src/dsp/x86/convolve_sse4.cc @@ -34,41 +34,7 @@ namespace dsp { namespace low_bitdepth { namespace { -#include "src/dsp/convolve.inc" - -// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and -// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final -// sum from outranging int16_t. -template <int filter_index> -__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { - __m128i sum; - if (filter_index < 2) { - // 6 taps. - const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 - const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 - const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 - sum = _mm_add_epi16(v_madd_21, v_madd_43); - sum = _mm_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { - // 8 taps. 
- const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 - const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 - const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 - const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); - const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); - sum = _mm_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { - // 2 taps. - sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 - } else { - // 4 taps. - const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 - sum = _mm_add_epi16(v_madd_32, v_madd_54); - } - return sum; -} +#include "src/dsp/x86/convolve_sse4.inc" template <int filter_index> __m128i SumHorizontalTaps(const uint8_t* const src, @@ -125,68 +91,7 @@ __m128i HorizontalTaps8To16(const uint8_t* const src, return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index> -__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i input0 = LoadLo8(&src[2]); - const __m128i input1 = LoadLo8(&src[2 + src_stride]); - - if (filter_index == 3) { - // 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3); - // 13 14 14 15 15 16 16 17 .... - const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3); - const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 - return v_sum_43; - } - - // 02 03 03 04 04 05 05 06 06 07 .... - const __m128i input0_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1); - // 12 13 13 14 14 15 15 16 16 17 .... 
- const __m128i input1_dup = - _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1); - // 04 05 05 06 06 07 07 08 ... - const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4); - // 14 15 15 16 16 17 17 18 ... - const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4); - const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup); - const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54); - const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 - const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 - const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); - return v_sum_5432; -} - -template <int filter_index> -__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - // Normally the Horizontal pass does the downshift in two passes: - // kInterRoundBitsHorizontal - 1 and then (kFilterBits - - // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them - // requires adding the rounding offset from the skipped shift. 
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); - - sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); - sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); - return _mm_packus_epi16(sum, sum); -} - -template <int filter_index> -__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, - const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); - - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int num_taps, int step, int filter_index, bool is_2d = false, +template <int num_taps, int filter_index, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, void* const dest, const ptrdiff_t pred_stride, @@ -197,7 +102,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, // 4 tap filters are never used when width > 4. if (num_taps != 4 && width > 4) { - int y = 0; + int y = height; do { int x = 0; do { @@ -214,12 +119,12 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, SimpleHorizontalTaps<filter_index>(&src[x], v_tap); StoreLo8(&dest8[x], result); } - x += step; + x += 8; } while (x < width); src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } @@ -229,7 +134,7 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, assert(num_taps <= 4); if (num_taps <= 4) { if (width == 4) { - int y = 0; + int y = height; do { if (is_2d || is_compound) { const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); @@ -241,12 +146,13 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, src += src_stride; dest8 += pred_stride; dest16 += pred_stride; - } while (++y < height); + } while (--y != 0); return; } if (!is_compound) { - int y = 0; + int y = height; + if (is_2d) y -= 1; do { if (is_2d) { const 
__m128i sum = @@ -265,8 +171,8 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } src += src_stride << 1; - y += 2; - } while (y < height - 1); + y -= 2; + } while (y != 0); // The 2d filters have an odd |height| because the horizontal pass // generates context for the vertical pass. @@ -298,303 +204,6 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, } } -template <int num_taps, bool is_2d_vertical = false> -LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, - __m128i* v_tap) { - if (num_taps == 8) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); - } - } else if (num_taps == 6) { - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 - v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); - } - } else if (num_taps == 4) { - v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 - v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 - if (is_2d_vertical) { - 
v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); - } - } else { // num_taps == 2 - const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); - v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 - if (is_2d_vertical) { - v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); - } else { - v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); - } - } -} - -template <int num_taps, bool is_compound> -__m128i SimpleSum2DVerticalTaps(const __m128i* const src, - const __m128i* const taps) { - __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); - __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); - if (num_taps >= 4) { - __m128i madd_lo = - _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); - __m128i madd_hi = - _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps >= 6) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - if (num_taps == 8) { - madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); - madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); - sum_lo = _mm_add_epi32(sum_lo, madd_lo); - sum_hi = _mm_add_epi32(sum_hi, madd_hi); - } - } - } - - if (is_compound) { - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), - RightShiftWithRounding_S32(sum_hi, - kInterRoundBitsCompoundVertical - 1)); - } - - return _mm_packs_epi32( - RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), - RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); -} - -template <int num_taps, bool 
is_compound = false> -void Filter2DVertical(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int width, - const int height, const __m128i* const taps) { - assert(width >= 8); - constexpr int next_row = num_taps - 1; - // The Horizontal pass uses |width| as |stride| for the intermediate buffer. - const ptrdiff_t src_stride = width; - - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - int x = 0; - do { - __m128i srcs[8]; - const uint16_t* src_x = src + x; - srcs[0] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 4) { - srcs[1] = LoadAligned16(src_x); - src_x += src_stride; - srcs[2] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps >= 6) { - srcs[3] = LoadAligned16(src_x); - src_x += src_stride; - srcs[4] = LoadAligned16(src_x); - src_x += src_stride; - if (num_taps == 8) { - srcs[5] = LoadAligned16(src_x); - src_x += src_stride; - srcs[6] = LoadAligned16(src_x); - src_x += src_stride; - } - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src_x); - src_x += src_stride; - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16 + x + y * dst_stride, sum); - } else { - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum)); - } - - srcs[0] = srcs[1]; - if (num_taps >= 4) { - srcs[1] = srcs[2]; - srcs[2] = srcs[3]; - if (num_taps >= 6) { - srcs[3] = srcs[4]; - srcs[4] = srcs[5]; - if (num_taps == 8) { - srcs[5] = srcs[6]; - srcs[6] = srcs[7]; - } - } - } - } while (++y < height); - x += 8; - } while (x < width); -} - -// Take advantage of |src_stride| == |width| to process two rows at a time. 
-template <int num_taps, bool is_compound = false> -void Filter2DVertical4xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 4) { - srcs[2] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); - if (num_taps == 8) { - srcs[6] = LoadAligned16(src); - src += 8; - srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); - } - } - } - - int y = 0; - do { - srcs[num_taps] = LoadAligned16(src); - src += 8; - srcs[num_taps - 1] = _mm_unpacklo_epi64( - _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); - if (is_compound) { - StoreUnaligned16(dst16, sum); - dst16 += 4 << 1; - } else { - const __m128i results = _mm_packus_epi16(sum, sum); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - if (num_taps >= 4) { - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - if (num_taps >= 6) { - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - if (num_taps == 8) { - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - } - } - } - y += 2; - } while (y < height); -} - -// Take advantage of |src_stride| == |width| to process four rows at a time. -template <int num_taps> -void Filter2DVertical2xH(const uint16_t* src, void* const dst, - const ptrdiff_t dst_stride, const int height, - const __m128i* const taps) { - constexpr int next_row = (num_taps < 6) ? 
4 : 8; - - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - srcs[0] = LoadAligned16(src); - src += 8; - if (num_taps >= 6) { - srcs[4] = LoadAligned16(src); - src += 8; - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - if (num_taps == 8) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } - } - - int y = 0; - do { - srcs[next_row] = LoadAligned16(src); - src += 8; - if (num_taps == 2) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - } else if (num_taps == 4) { - srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - } else if (num_taps == 6) { - srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); - srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - } else if (num_taps == 8) { - srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); - srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); - srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); - } - - const __m128i sum = - SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); - const __m128i results = _mm_packus_epi16(sum, sum); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. - // Therefore we don't need to check this condition when |height| > 4. 
- if (num_taps <= 4 && height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - if (num_taps == 6) { - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - } else if (num_taps == 8) { - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - } - - y += 4; - } while (y < height); -} - template <bool is_2d = false, bool is_compound = false> LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, @@ -607,28 +216,28 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 8, 2, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 1, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 8, 0, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 4) { // 4 tap. SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 4, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 5) { // 4 tap. 
SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 8, 5, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 8, 3, is_2d, is_compound>( - src, src_stride, dst, dst_stride, width, height, v_tap); + FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -718,39 +327,6 @@ void Convolve2D_SSE4_1(const void* const reference, } } -// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D -// Vertical calculations. -__m128i Compound1DShift(const __m128i sum) { - return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); -} - -template <int filter_index> -__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { - __m128i v_src[4]; - - if (filter_index < 2) { - // 6 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { - // 8 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { - // 2 taps. - v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { - // 4 taps. 
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); - } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); - return sum; -} - template <int filter_index, bool is_compound = false> void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, @@ -787,7 +363,9 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } - int y = 0; + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; do { srcs[next_row] = LoadLo8(src_x); src_x += src_stride; @@ -795,11 +373,13 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16 + x + y * dst_stride, results); + StoreUnaligned16(dst16_x, results); + dst16_x += dst_stride; } else { const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1); - StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results)); + StoreLo8(dst8_x, _mm_packus_epi16(results, results)); + dst8_x += dst_stride; } srcs[0] = srcs[1]; @@ -815,506 +395,11 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride, } } } - } while (++y < height); + } while (--y != 0); x += 8; } while (x < width); } -template <int filter_index, bool is_compound = false> -void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - auto* dst16 = static_cast<uint16_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - - int y = 0; - do { - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = 
_mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - y += 2; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - - int y = 0; - do { - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - y += 2; - } while (y < height); - } else 
if (num_taps == 6) { - srcs[6] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - - int y = 0; - do { - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - y += 2; - } while (y < height); - } else if (num_taps == 8) { - srcs[8] = _mm_setzero_si128(); - // 00 01 02 03 - srcs[0] = Load4(src); - src += src_stride; - // 10 11 12 13 - const __m128i a = Load4(src); - // 00 01 02 03 10 11 12 13 - srcs[0] = _mm_unpacklo_epi32(srcs[0], a); - src += src_stride; - // 20 21 22 23 - srcs[2] = Load4(src); - src += src_stride; - // 10 11 12 13 20 21 22 23 - srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - // 30 31 
32 33 - const __m128i b = Load4(src); - // 20 21 22 23 30 31 32 33 - srcs[2] = _mm_unpacklo_epi32(srcs[2], b); - src += src_stride; - // 40 41 42 43 - srcs[4] = Load4(src); - src += src_stride; - // 30 31 32 33 40 41 42 43 - srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - // 50 51 52 53 - const __m128i c = Load4(src); - // 40 41 42 43 50 51 52 53 - srcs[4] = _mm_unpacklo_epi32(srcs[4], c); - src += src_stride; - // 60 61 62 63 - srcs[6] = Load4(src); - src += src_stride; - // 50 51 52 53 60 61 62 63 - srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - - int y = 0; - do { - // 70 71 72 73 - const __m128i d = Load4(src); - // 60 61 62 63 70 71 72 73 - srcs[6] = _mm_unpacklo_epi32(srcs[6], d); - src += src_stride; - // 80 81 82 83 - srcs[8] = Load4(src); - src += src_stride; - // 70 71 72 73 80 81 82 83 - srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - if (is_compound) { - const __m128i results = Compound1DShift(sums); - StoreUnaligned16(dst16, results); - dst16 += 4 << 1; - } else { - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - Store4(dst8, results); - dst8 += dst_stride; - Store4(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - } - - srcs[0] = srcs[2]; - srcs[1] = srcs[3]; - srcs[2] = srcs[4]; - srcs[3] = srcs[5]; - srcs[4] = srcs[6]; - srcs[5] = srcs[7]; - srcs[6] = srcs[8]; - y += 2; - } while (y < height); - } -} - -template <int filter_index, bool negative_outside_taps = false> -void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, - void* const dst, const ptrdiff_t dst_stride, - const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); - auto* dst8 = static_cast<uint8_t*>(dst); - - __m128i srcs[9]; - - if (num_taps == 2) { - srcs[2] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - - int y = 0; - do { 
- // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[2] = Load2<0>(src, srcs[2]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 - const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_2, 2); - // This uses srcs[0]..srcs[1]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[2]; - y += 4; - } while (y < height); - } else if (num_taps == 4) { - srcs[4] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - - int y = 0; - do { - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2<0>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - // This 
uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - if (height == 2) return; - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - y += 4; - } while (y < height); - } else if (num_taps == 6) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4x, 2); - - int y = 0; - do { - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); 
- - // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } else if (num_taps == 8) { - // During the vertical pass the number of taps is restricted when - // |height| <= 4. - assert(height > 4); - srcs[8] = _mm_setzero_si128(); - // 00 01 - srcs[0] = Load2(src); - src += src_stride; - // 00 01 10 11 - srcs[0] = Load2<1>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 - srcs[0] = Load2<2>(src, srcs[0]); - src += src_stride; - // 00 01 10 11 20 21 30 31 - srcs[0] = Load2<3>(src, srcs[0]); - src += src_stride; - // 40 41 - srcs[4] = Load2(src); - src += src_stride; - // 40 41 50 51 - srcs[4] = Load2<1>(src, srcs[4]); - src += src_stride; - // 40 41 50 51 60 61 - srcs[4] = Load2<2>(src, srcs[4]); - src += src_stride; - - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 - const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); - // 10 11 20 21 30 31 40 41 - srcs[1] = _mm_srli_si128(srcs_0_4, 2); - // 20 21 30 31 40 41 50 51 - srcs[2] = _mm_srli_si128(srcs_0_4, 4); - // 30 31 40 41 50 51 60 61 - srcs[3] = _mm_srli_si128(srcs_0_4, 6); - - int y = 0; - do { - // 40 41 50 51 60 61 70 71 - srcs[4] = Load2<3>(src, srcs[4]); - src += src_stride; - // 80 81 - srcs[8] = Load2<0>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 - srcs[8] = Load2<1>(src, srcs[8]); - src += src_stride; - // 80 81 90 91 a0 a1 - srcs[8] = Load2<2>(src, srcs[8]); - src += src_stride; - - // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 - const __m128i 
srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); - // 50 51 60 61 70 71 80 81 - srcs[5] = _mm_srli_si128(srcs_4_8, 2); - // 60 61 70 71 80 81 90 91 - srcs[6] = _mm_srli_si128(srcs_4_8, 4); - // 70 71 80 81 90 91 a0 a1 - srcs[7] = _mm_srli_si128(srcs_4_8, 6); - - // This uses srcs[0]..srcs[7]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); - const __m128i results_16 = - RightShiftWithRounding_S16(sums, kFilterBits - 1); - const __m128i results = _mm_packus_epi16(results_16, results_16); - - Store2(dst8, results); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 2)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 4)); - dst8 += dst_stride; - Store2(dst8, _mm_srli_si128(results, 6)); - dst8 += dst_stride; - - srcs[0] = srcs[4]; - srcs[1] = srcs[5]; - srcs[2] = srcs[6]; - srcs[3] = srcs[7]; - srcs[4] = srcs[8]; - y += 4; - } while (y < height); - } -} - void ConvolveVertical_SSE4_1(const void* const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, @@ -1339,9 +424,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, if (filter_index < 2) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1349,9 +434,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 2) { // 8 tap. 
SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1359,9 +444,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 3) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1369,9 +454,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, } else if (filter_index == 4) { // 4 tap. 
SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1382,9 +467,9 @@ void ConvolveVertical_SSE4_1(const void* const reference, SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, taps); @@ -1474,8 +559,8 @@ void ConvolveCompoundVertical_SSE4_1( if (filter_index < 2) { // 6 tap. 
SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1484,8 +569,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<8>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1494,8 +579,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<2>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1504,8 +589,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1514,8 +599,8 @@ void ConvolveCompoundVertical_SSE4_1( SetupTaps<4>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); @@ -1752,7 +837,11 @@ inline void GetHalfSubPixelFilter(__m128i* output) { template <int num_taps, int grade_x> inline 
void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, __m128i* const source /*[num_taps >> 1]*/) { - const __m128i src_vals = LoadUnaligned16(src); + // |used_bytes| is only computed in msan builds. Mask away unused bytes for + // msan because it incorrectly models the outcome of the shuffles in some + // cases. This has not been reproduced out of context. + const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2; + const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes); source[0] = _mm_shuffle_epi8(src_vals, src_indices); if (grade_x == 1) { if (num_taps > 2) { @@ -1768,7 +857,7 @@ inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices, assert(grade_x > 1); assert(num_taps != 4); // grade_x > 1 also means width >= 8 && num_taps != 4 - const __m128i src_vals_ext = LoadLo8(src + 16); + const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes); if (num_taps > 2) { source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2), src_indices); @@ -1983,14 +1072,10 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo, // |width_class| is 2, 4, or 8, according to the Store function that should be // used. 
template <int num_taps, int width_class, bool is_compound> -#if LIBGAV1_MSAN -__attribute__((no_sanitize_memory)) void ConvolveVerticalScale( -#else -inline void ConvolveVerticalScale( -#endif - const int16_t* src, const int width, const int subpixel_y, - const int filter_index, const int step_y, const int height, void* dest, - const ptrdiff_t dest_stride) { +inline void ConvolveVerticalScale(const int16_t* src, const int width, + const int subpixel_y, const int filter_index, + const int step_y, const int height, + void* dest, const ptrdiff_t dest_stride) { constexpr ptrdiff_t src_stride = kIntermediateStride; constexpr int kernel_offset = (8 - num_taps) / 2; const int16_t* src_y = src; @@ -2819,7 +1904,7 @@ void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc new file mode 100644 index 0000000..550d6a4 --- /dev/null +++ b/src/dsp/x86/convolve_sse4.inc @@ -0,0 +1,934 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Common 128 bit functions used for sse4/avx2 convolve implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. 
+ +#include "src/dsp/convolve.inc" + +// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and +// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final +// sum from outranging int16_t. +template <int filter_index> +__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { + __m128i sum; + if (filter_index < 2) { + // 6 taps. + const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 + const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 + const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 + sum = _mm_add_epi16(v_madd_21, v_madd_43); + sum = _mm_add_epi16(sum, v_madd_65); + } else if (filter_index == 2) { + // 8 taps. + const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 + const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 + const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 + const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); + const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); + sum = _mm_add_epi16(v_sum_7654, v_sum_3210); + } else if (filter_index == 3) { + // 2 taps. + sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 + } else { + // 4 taps. 
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 + sum = _mm_add_epi16(v_madd_32, v_madd_54); + } + return sum; +} + +template <int filter_index> +__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); + + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 + const __m128i v_src_43 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); + const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + return v_sum_43; + } + + // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 + const __m128i v_src_32 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); + // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx + const __m128i v_src_54 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c, + static_cast<int>(0x80070706), 0x06050504)); + const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); + return v_sum_5432; +} + +template <int filter_index> +__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them + // requires adding the rounding offset from the skipped shift. 
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); + + sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); + sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); + return _mm_packus_epi16(sum, sum); +} + +template <int filter_index> +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int num_taps, bool is_2d_vertical = false> +LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, + __m128i* v_tap) { + if (num_taps == 8) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } + } else if (num_taps == 6) { + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 + v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } + } else if 
(num_taps == 4) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } + } else { // num_taps == 2 + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } + } +} + +template <int num_taps, bool is_compound> +__m128i SimpleSum2DVerticalTaps(const __m128i* const src, + const __m128i* const taps) { + __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); + __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m128i madd_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); + __m128i madd_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + } + } + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm_packs_epi32( + 
RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template <int num_taps, bool is_compound = false> +void Filter2DVertical(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m128i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. + const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + int x = 0; + do { + __m128i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned16(src_x); + src_x += src_stride; + srcs[2] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned16(src_x); + src_x += src_stride; + srcs[4] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned16(src_x); + src_x += src_stride; + srcs[6] = LoadAligned16(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadAligned16(src_x); + src_x += src_stride; + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16_x, sum); + dst16_x += dst_stride; + } else { + StoreLo8(dst8_x, _mm_packus_epi16(sum, sum)); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. 
+template <int num_taps, bool is_compound = false> +void Filter2DVertical4xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); + if (num_taps == 8) { + srcs[6] = LoadAligned16(src); + src += 8; + srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); + } + } + } + + int y = height; + do { + srcs[num_taps] = LoadAligned16(src); + src += 8; + srcs[num_taps - 1] = _mm_unpacklo_epi64( + _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16, sum); + dst16 += 4 << 1; + } else { + const __m128i results = _mm_packus_epi16(sum, sum); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template <int num_taps> +void Filter2DVertical2xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + constexpr int next_row = (num_taps < 6) ? 
4 : 8; + + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + if (num_taps == 8) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } + } + + int y = height; + do { + srcs[next_row] = LoadAligned16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + } else if (num_taps == 4) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } else if (num_taps == 6) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + } else if (num_taps == 8) { + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); + srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); + } + + const __m128i sum = + SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); + const __m128i results = _mm_packus_epi16(sum, sum); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. + if (num_taps <= 4 && height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y -= 4; + } while (y != 0); +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. 
+__m128i Compound1DShift(const __m128i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template <int filter_index> +__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { + __m128i v_src[4]; + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + } + const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + return sum; +} + +// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the +// 2D version. 
+template <int num_taps, int filter_index, bool is_compound = false> +void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast<uint8_t*>(dst); + auto* dst16 = static_cast<uint16_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + + int y = height; + do { + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + y -= 2; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = height; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, 
srcs[4]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y -= 2; + } while (y != 0); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = height; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, 
_mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y -= 2; + } while (y != 0); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = height; + do { + // 70 71 72 73 + const __m128i d = Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + 
srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y -= 2; + } while (y != 0); + } +} + +template <int num_taps, int filter_index, bool negative_outside_taps = false> +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast<uint8_t*>(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y -= 4; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. 
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y -= 4; + } while (y != 0); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = height; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses 
srcs[0]..srcs[5]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = height; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = 
_mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. + const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } +} diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc index deb57ef..3c29b19 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.cc +++ b/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -30,6 +30,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; @@ -212,13 +213,231 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kInterPostRoundBit = 4; + +inline __m128i ComputeWeightedAverage8(const __m128i& pred0, + const __m128i& pred1, + const __m128i& weight0, + const __m128i& weight1) { + // This offset is a combination of round_factor and round_offset + // which are to be added and subtracted respectively. + // Here kInterPostRoundBit + 4 is considering bitdepth=10. 
+ constexpr int offset = + (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i bias = _mm_set1_epi32(offset); + const __m128i clip_high = _mm_set1_epi16(kMax10bppSample); + + __m128i prediction0 = _mm_cvtepu16_epi32(pred0); + __m128i mult0 = _mm_mullo_epi32(prediction0, weight0); + __m128i prediction1 = _mm_cvtepu16_epi32(pred1); + __m128i mult1 = _mm_mullo_epi32(prediction1, weight1); + __m128i sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + + prediction0 = _mm_unpackhi_epi16(pred0, zero); + mult0 = _mm_mullo_epi32(prediction0, weight0); + prediction1 = _mm_unpackhi_epi16(pred1, zero); + mult1 = _mm_mullo_epi32(prediction1, weight1); + sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + const __m128i pack = _mm_packus_epi32(result0, result1); + + return _mm_min_epi16(pack, clip_high); +} + +template <int height> +inline void DistanceWeightedBlend4xH_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadLo8(pred_0); + const __m128i src_10 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + __m128i src_0 = LoadHi8(src_00, pred_0); + __m128i src_1 = LoadHi8(src_10, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res0 = + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + const __m128i src_01 = LoadLo8(pred_0); + const __m128i src_11 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + src_0 = LoadHi8(src_01, pred_0); + src_1 = LoadHi8(src_11, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res1 
= + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + StoreLo8(dst, res0); + dst += dest_stride; + StoreHi8(dst, res0); + dst += dest_stride; + StoreLo8(dst, res1); + dst += dest_stride; + StoreHi8(dst, res1); + dst += dest_stride; + y -= 4; + } while (y != 0); +} + +template <int height> +inline void DistanceWeightedBlend8xH_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); + + StoreUnaligned16(dst, res0); + dst += dest_stride; + StoreUnaligned16(dst, res1); + dst += dest_stride; + y -= 2; + } while (y != 0); +} + +inline void DistanceWeightedBlendLarge_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + int x = 0; + do { + const __m128i src_0_lo = LoadAligned16(pred_0 + x); + const __m128i src_1_lo = LoadAligned16(pred_1 + x); + const __m128i res_lo = + ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1); + + const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8); + const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8); + const __m128i 
res_hi = + ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1); + + StoreUnaligned16(dst + x, res_lo); + x += 8; + StoreUnaligned16(dst + x, res_hi); + x += 8; + } while (x < width); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void DistanceWeightedBlend_SSE4_1(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, + const uint8_t weight_1, const int width, + const int height, void* const dest, + const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0); + if (width == 4) { + if (height == 4) { + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } else if (height == 8) { + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } else { + assert(height == 16); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + case 8: + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + case 16: + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, + dest, dst_stride); + + return; + } + } + + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, + height, dest, dst_stride); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend) + dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1; +#endif 
+} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h index 8646eca..dbb9f88 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.h +++ b/src/dsp/x86/distance_weighted_blend_sse4.h @@ -36,6 +36,10 @@ void DistanceWeightedBlendInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc new file mode 100644 index 0000000..745c1ca --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.cc @@ -0,0 +1,514 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include <smmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// Load 8 values from source, widening to int16_t intermediate value size. +// The function is overloaded for each type and bitdepth for simplicity. +inline __m128i LoadSource(const int8_t* src) { + return _mm_cvtepi8_epi16(LoadLo8(src)); +} + +// Load 8 values from source, widening to int16_t intermediate value size. +inline __m128i LoadSource(const uint8_t* src) { + return _mm_cvtepu8_epi16(LoadLo8(src)); +} + +inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) { + return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range)); +} + +// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value. +inline void StoreUnsigned(uint8_t* dest, const __m128i data) { + StoreLo8(dest, _mm_packus_epi16(data, data)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// Load 8 values from source. +inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); } + +// Load 8 values from source. +inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); } + +// Store 8 values to dest. +inline void StoreUnsigned(uint16_t* dest, const __m128i data) { + StoreUnaligned16(dest, data); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. 
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16(luma); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8(luma)); +} + +inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1); + } + return LoadUnaligned16(luma); +} + +inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16( + LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)), + LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))), + 1); + } + return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +inline __m128i Clip3(const __m128i value, const __m128i low, + const __m128i high) { + const __m128i clipped_to_ceiling = _mm_min_epi16(high, value); + return _mm_max_epi16(low, clipped_to_ceiling); +} + +template <int bitdepth, typename Pixel> +inline __m128i GetScalingFactors( + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { + alignas(16) int16_t start_vals[8]; + if (bitdepth == 8) { + // TODO(petersonab): Speed this 
up by creating a uint16_t scaling_lut. + // Currently this code results in a series of movzbl. + for (int i = 0; i < 8; ++i) { + start_vals[i] = scaling_lut[source[i]]; + } + return LoadAligned16(start_vals); + } + alignas(16) int16_t end_vals[8]; + // TODO(petersonab): Precompute this into a larger table for direct lookups. + for (int i = 0; i < 8; ++i) { + const int index = source[i] >> 2; + start_vals[i] = scaling_lut[index]; + end_vals[i] = scaling_lut[index + 1]; + } + const __m128i start = LoadAligned16(start_vals); + const __m128i end = LoadAligned16(end_vals); + __m128i remainder = LoadSource(source); + remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1); + const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder); + return _mm_add_epi16(start, delta); +} + +// |scaling_shift| is in range [8,11]. +template <int bitdepth> +inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling, + const __m128i scaling_shift) { + const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift); + return _mm_mulhrs_epi16(noise, shifted_scale_factors); +} + +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageLuma_SSE4_1( + const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, + int width, int height, int start_height, + const uint8_t scaling_lut_y[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, + ptrdiff_t dest_stride_y) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + auto* out_y_row = static_cast<Pixel*>(dest_plane_y); + dest_stride_y /= sizeof(Pixel); + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_luma); + const int safe_width = width & ~7; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + 
int y = 0; + do { + int x = 0; + for (; x < safe_width; x += 8) { + // TODO(b/133525232): Make 16-pixel version of loop body. + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + + if (x < width) { + Pixel luma_buffer[8]; + // Prevent arbitrary indices from entering GetScalingFactors. + memset(luma_buffer, 0, sizeof(luma_buffer)); + const int valid_range = width - x; + memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + in_y_row += source_stride_y; + out_y_row += dest_stride_y; + } while (++y < height); + out_y_row = static_cast<Pixel*>(dest_plane_y); +} + +template <int bitdepth, typename GrainType, typename Pixel> +inline __m128i BlendChromaValsWithCfl( + const Pixel* average_luma_buffer, + const uint8_t scaling_lut[kScalingLookupTableSize], + const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const __m128i scaling_shift) { + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const __m128i orig = LoadSource(chroma_cursor); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); 
+} + +template <int bitdepth, typename GrainType, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( + const Array2D<GrainType>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, + ptrdiff_t source_stride_y, const Pixel* in_chroma_row, + ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + alignas(16) Pixel luma_buffer[16]; + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final pixel will + // need to be guarded from overread, even if |chroma_width| is divisible by 8. + const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + + // Writing to this buffer avoids the cost of doing 8 lane lookups in a row + // in GetScalingFactors. + Pixel average_luma_buffer[8]; + assert(start_height % 2 == 0); + start_height >>= subsampling_y; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + // TODO(petersonab): Consider specializing by subsampling_x. In the 444 + // case &in_y_row[x] can be passed to GetScalingFactors directly. 
+ const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + // This section only runs if width % (8 << sub_x) != 0. It should never run + // on 720p and above. + if (x < chroma_width) { + // Prevent huge indices from entering GetScalingFactors due to + // uninitialized values. This is not a problem in 8bpp because the table + // is made larger than 255 values. + if (bitdepth > 8) { + memset(luma_buffer, 0, sizeof(luma_buffer)); + } + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == true. +// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. 
+template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageChromaWithCfl_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast<const Pixel*>(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast<Pixel*>(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +// |offset| is 32x4 packed to add with the result of _mm_madd_epi16. 
+inline __m128i BlendChromaValsNoCfl8bpp( + const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig, + const int8_t* noise_image_cursor, const __m128i& average_luma, + const __m128i& scaling_shift, const __m128i& offset, + const __m128i& weights) { + uint8_t merged_buffer[8]; + const __m128i combined_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); + const __m128i combined_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights); + const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6), + _mm_srai_epi32((combined_hi), 6)); + + const __m128i merged = _mm_add_epi16(merged_base, offset); + + StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); + const __m128i scaling = + GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<8>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( + const Array2D<int8_t>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, + ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, + ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel + // will need to be guarded from overread, even if |chroma_width| is a + // multiple of 8. 
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + alignas(16) uint8_t luma_buffer[16]; + const __m128i offset = _mm_set1_epi16(chroma_offset); + const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) | + (luma_multiplier & 0xFFFF)); + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + const __m128i orig_chroma = LoadSource(&in_chroma_row[x]); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + // There is no need to pre-initialize this buffer, because merged values + // used as indices are saturated in the 8bpp case. Uninitialized values + // are written outside the frame. 
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const int valid_range_chroma = chroma_width - x; + uint8_t chroma_buffer[8]; + memcpy(chroma_buffer, &in_chroma_row[x], + valid_range_chroma * sizeof(in_chroma_row[0])); + + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + const __m128i orig_chroma = + LoadSourceMsan(chroma_buffer, valid_range_chroma); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma8bpp_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast<const Array2D<int8_t>*>(noise_image_ptr); + const auto* in_y = static_cast<const uint8_t*>(source_plane_y); + const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv); + auto* out_uv = static_cast<uint8_t*>(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? 
params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane8bpp_SSE4_1( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv, + source_stride_uv, out_uv, dest_stride_uv); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>; + dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace film_grain + +void FilmGrainInit_SSE4_1() { + film_grain::low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + film_grain::high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void FilmGrainInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/film_grain_sse4.h b/src/dsp/x86/film_grain_sse4.h new file mode 100644 index 0000000..1cacbac --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.h @@ -0,0 +1,40 @@ 
+/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initialize members of Dsp::film_grain. This function is not thread-safe. +void FilmGrainInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1 +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1 +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_ diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc index 4a8658d..d6af907 100644 --- a/src/dsp/x86/intra_edge_sse4.cc +++ b/src/dsp/x86/intra_edge_sse4.cc @@ -22,7 +22,7 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" @@ -259,7 +259,7 @@ void IntraEdgeInit_SSE4_1() { Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace 
libgav1 { namespace dsp { diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc index fac1556..f2dcfdb 100644 --- a/src/dsp/x86/intrapred_cfl_sse4.cc +++ b/src/dsp/x86/intrapred_cfl_sse4.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_cfl.h" #include "src/utils/cpu.h" #if LIBGAV1_TARGETING_SSE4_1 @@ -29,9 +29,48 @@ #include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { +namespace { + +// This duplicates the last two 16-bit values in |row|. +inline __m128i LastRowSamples(const __m128i row) { + return _mm_shuffle_epi32(row, 0xFF); +} + +// This duplicates the last 16-bit value in |row|. +inline __m128i LastRowResult(const __m128i row) { + const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); + return _mm_shuffle_epi32(dup_row, 0xFF); +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreLo8(luma_ptr, result); + StoreHi8(luma_ptr + kCflLumaBufferStride, result); + return result; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreUnaligned16(luma_ptr, result); + return result; +} + +} // namespace + namespace low_bitdepth { namespace { @@ -40,8 +79,8 @@ namespace { inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { - __m128i ac_q3 = LoadUnaligned16(input); - __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); return _mm_add_epi16(scaled_luma_q0, dc_q0); @@ -88,8 +127,7 @@ void CflIntraPredictor_SSE4_1( template <int block_height_log2, bool is_inside> void CflSubsampler444_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], - const int /*max_luma_width*/, const int max_luma_height, - const void* const source, ptrdiff_t stride) { + const int max_luma_height, const void* const source, ptrdiff_t stride) { static_assert(block_height_log2 <= 4, ""); const int block_height = 1 << block_height_log2; const int visible_height = max_luma_height; @@ -119,12 +157,15 @@ void CflSubsampler444_4xH_SSE4_1( } while (y < visible_height); if (!is_inside) { - int y = visible_height; + // Replicate the 2 high lanes. 
+ samples = _mm_shuffle_epi32(samples, 0xee); do { + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; StoreHi8(luma_ptr, samples); luma_ptr += kCflLumaBufferStride; sum = _mm_add_epi16(sum, samples); - ++y; + y += 2; } while (y < block_height); } @@ -152,15 +193,15 @@ void CflSubsampler444_4xH_SSE4_1( static_assert(block_height_log2 <= 4, ""); assert(max_luma_width >= 4); assert(max_luma_height >= 4); - const int block_height = 1 << block_height_log2; - const int block_width = 4; + static_cast<void>(max_luma_width); + constexpr int block_height = 1 << block_height_log2; - if (block_height <= max_luma_height && block_width <= max_luma_width) { - CflSubsampler444_4xH_SSE4_1<block_height_log2, true>( - luma, max_luma_width, max_luma_height, source, stride); + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); } else { - CflSubsampler444_4xH_SSE4_1<block_height_log2, false>( - luma, max_luma_width, max_luma_height, source, stride); + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); } } @@ -302,19 +343,9 @@ void CflSubsampler444_SSE4_1( __m128i inner_sum_lo, inner_sum_hi; int y = 0; do { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 0; x < 16; x++) { - const int x_index = std::min(x, visible_width_16 - 1); - c_arr[x] = src[x_index] << 3; - } - samples0 = LoadUnaligned16(c_arr); - samples1 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_16); -#else - __m128i samples01 = LoadUnaligned16(src); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. 
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16); if (!inside) { const __m128i border16 = @@ -323,26 +354,15 @@ void CflSubsampler444_SSE4_1( } samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3); samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr, samples0); StoreUnaligned16(luma_ptr + 8, samples1); __m128i inner_sum = _mm_add_epi16(samples0, samples1); if (block_width == 32) { -#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are - // then masked off by blendv, MSAN isn't smart enough to - // understand that. So we switch to a C implementation here. - uint16_t c_arr[16]; - for (int x = 16; x < 32; x++) { - const int x_index = std::min(x, visible_width_32 - 1); - c_arr[x - 16] = src[x_index] << 3; - } - samples2 = LoadUnaligned16(c_arr); - samples3 = LoadUnaligned16(c_arr + 8); - static_cast<void>(blend_mask_32); -#else - __m128i samples23 = LoadUnaligned16(src + 16); + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. + __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32); if (!inside) { const __m128i border32 = _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1])); @@ -350,7 +370,6 @@ void CflSubsampler444_SSE4_1( } samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3); samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3); -#endif // LIBGAV1_MSAN StoreUnaligned16(luma_ptr + 16, samples2); StoreUnaligned16(luma_ptr + 24, samples3); @@ -418,29 +437,6 @@ void CflSubsampler444_SSE4_1( } } -// Takes in two sums of input row pairs, and completes the computation for two -// output rows. 
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreLo8(luma_ptr, result); - StoreHi8(luma_ptr + kCflLumaBufferStride, result); - return result; -} - -// Takes two halves of a vertically added pair of rows and completes the -// computation for one output row. -inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, - const __m128i vertical_sum1, - int16_t* luma_ptr) { - __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); - result = _mm_slli_epi16(result, 1); - StoreUnaligned16(luma_ptr, result); - return result; -} - template <int block_height_log2> void CflSubsampler420_4xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -511,17 +507,6 @@ void CflSubsampler420_4xH_SSE4_1( } } -// This duplicates the last two 16-bit values in |row|. -inline __m128i LastRowSamples(const __m128i row) { - return _mm_shuffle_epi32(row, 0xFF); -} - -// This duplicates the last 16-bit value in |row|. -inline __m128i LastRowResult(const __m128i row) { - const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); - return _mm_shuffle_epi32(dup_row, 0xFF); -} - template <int block_height_log2, int max_luma_width> inline void CflSubsampler420Impl_8xH_SSE4_1( int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], @@ -655,10 +640,11 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( __m128i final_sum = zero; const int block_height = 1 << block_height_log2; const int luma_height = std::min(block_height, max_luma_height >> 1); + static_assert(max_luma_width <= 32, ""); int16_t* luma_ptr = luma[0]; __m128i final_row_result; - // Begin first y section, covering width up to 16. + // Begin first y section, covering width up to 32. 
int y = 0; do { const uint8_t* src_next = src + stride; @@ -694,29 +680,32 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( final_row_result = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); sum = _mm_add_epi16(sum, final_row_result); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + sum = _mm_add_epi16(sum, wide_fill); + sum = _mm_add_epi16(sum, wide_fill); + } final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); src += stride << 1; luma_ptr += kCflLumaBufferStride; } while (++y < luma_height); - // Because max_luma_width is at most 32, any values beyond x=16 will - // necessarily be duplicated. - if (block_width_log2 == 5) { - const __m128i wide_fill = LastRowResult(final_row_result); - // Multiply duplicated value by number of occurrences, height * 4, since - // there are 16 in each row and the value appears in the vector 4 times. - final_sum = _mm_add_epi32( - final_sum, - _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2)); - } - // Begin second y section. if (y < block_height) { const __m128i final_fill0 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); const __m128i final_fill1 = LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. 
+ wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); @@ -726,6 +715,9 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( do { StoreUnaligned16(luma_ptr, final_fill0); StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } luma_ptr += kCflLumaBufferStride; final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); @@ -747,14 +739,10 @@ inline void CflSubsampler420Impl_WxH_SSE4_1( const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); final_row_result = _mm_sub_epi16(samples1, averages); StoreUnaligned16(luma_ptr + 8, final_row_result); - } - if (block_width_log2 == 5) { - int16_t* wide_luma_ptr = luma[0] + 16; - const __m128i wide_fill = LastRowResult(final_row_result); - for (int i = 0; i < block_height; - ++i, wide_luma_ptr += kCflLumaBufferStride) { - StoreUnaligned16(wide_luma_ptr, wide_fill); - StoreUnaligned16(wide_luma_ptr + 8, wide_fill); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); } } } @@ -958,7 +946,882 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflIntraPredictor_10bpp_SSE4_1 + +inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = 
_mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(x, max), min); +} + +template <int width, int height> +void CflIntraPredictor_10bpp_SSE4_1( + void* const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + constexpr int kCflLumaBufferStrideLog2_16i = 5; + constexpr int kCflLumaBufferStrideLog2_128i = + kCflLumaBufferStrideLog2_16i - 3; + constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i; + auto* dst = static_cast<uint16_t*>(dest); + const __m128i alpha_sign = _mm_set1_epi16(alpha); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + auto* row = reinterpret_cast<const __m128i*>(luma); + const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i); + const __m128i dc_val = _mm_set1_epi16(dst[0]); + const __m128i min = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + + stride >>= 1; + + do { + __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val); + res = ClipEpi16(res, min, max); + if (width == 4) { + StoreLo8(dst, res); + } else if (width == 8) { + StoreUnaligned16(dst, res); + } else if (width == 16) { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + } else { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + const __m128i res_2 = + CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max)); + const __m128i res_3 = + CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val); + 
StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max)); + } + + dst += stride; + } while ((row += kRowIncr) < row_end); +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples = LoadHi8(LoadLo8(src), src + src_stride); + src += src_stride << 1; + sum = _mm_add_epi16(sum, samples); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples = _mm_unpackhi_epi64(samples, samples); + do { + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1); + averages = _mm_shufflelo_epi16(averages, 0); + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadLo8(src); + samples = _mm_slli_epi16(samples, 3); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples 
= LoadUnaligned16(src); + src += src_stride; + sum = _mm_add_epi16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = _mm_add_epi16(sum, samples); + } while (++y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8). + __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadUnaligned16(src); + samples = _mm_slli_epi16(samples, 3); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, 
max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i sum = zero; + __m128i inner_sum_lo, inner_sum_hi; + __m128i samples[4]; + int y = visible_height; + + do { + samples[0] = LoadUnaligned16(src); + samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8) + : LastRowResult(samples[0]); + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? 
LoadUnaligned16(src + 24) + : LastRowResult(samples[2]); + + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + do { + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + } while (++y < block_height); + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). 
+ __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + __m128i samples_ext = zero; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + samples[idx] = LoadUnaligned16(&src[x]); + samples[idx] = _mm_slli_epi16(samples[idx], 3); + samples_ext = samples[idx]; + } else { + samples[idx] = LastRowResult(samples_ext); + } + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_height_log2> +void CflSubsampler420_4xH_SSE4_1( + int16_t 
luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row0 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row1 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row3 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row5 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row7 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (y = luma_height; y < 
block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const __m128i samples_row10 = LoadUnaligned16(src); + const __m128i samples_row11 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = LoadUnaligned16(src); + const __m128i samples_row21 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const __m128i samples_row30 = LoadUnaligned16(src); + const __m128i samples_row31 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = LoadUnaligned16(src); + const __m128i samples_row41 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const __m128i samples_row50 = LoadUnaligned16(src); + const __m128i samples_row51 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = LoadUnaligned16(src); + const __m128i samples_row61 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const __m128i samples_row70 = LoadUnaligned16(src); + const __m128i samples_row71 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. + const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (y = luma_height; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + 
CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>( + luma, max_luma_height, source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* const source, ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + __m128i final_row_result; + // Begin first y section, covering width up to 32. + int y = luma_height; + + do { + const uint16_t* src_next = src + src_stride; + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width >= 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + const __m128i samples_row02 = (max_luma_width >= 24) + ? LoadUnaligned16(src + 16) + : LastRowSamples(samples_row01); + const __m128i samples_row03 = (max_luma_width == 32) + ? LoadUnaligned16(src + 24) + : LastRowSamples(samples_row02); + const __m128i samples_row10 = LoadUnaligned16(src_next); + const __m128i samples_row11 = (max_luma_width >= 16) + ? LoadUnaligned16(src_next + 8) + : LastRowSamples(samples_row10); + const __m128i samples_row12 = (max_luma_width >= 24) + ? LoadUnaligned16(src_next + 16) + : LastRowSamples(samples_row11); + const __m128i samples_row13 = (max_luma_width == 32) + ? 
LoadUnaligned16(src_next + 24) + : LastRowSamples(samples_row12); + const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11); + const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12); + const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13); + __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_row_result = + StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + sum = _mm_add_epi16(sum, final_row_result); + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. + final_sum = _mm_add_epi32( + final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. + y = luma_height; + if (y < block_height) { + const __m128i final_fill0 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill1 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. 
+ wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); + const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); + const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_inner_sum0, final_inner_sum1); + + do { + StoreUnaligned16(luma_ptr, final_fill0); + StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. + + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_width_log2 + block_height_log2); + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples0 = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages)); + const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); + final_row_result = _mm_sub_epi16(samples1, averages); + StoreUnaligned16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + 
CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>( + luma, max_luma_height, source, stride); + return; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_10bpp_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_10bpp_SSE4_1<4, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_10bpp_SSE4_1<4, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_10bpp_SSE4_1<8, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_10bpp_SSE4_1<8, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_10bpp_SSE4_1<8, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_10bpp_SSE4_1<8, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x4] = + 
CflIntraPredictor_10bpp_SSE4_1<16, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_10bpp_SSE4_1<16, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_10bpp_SSE4_1<16, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_10bpp_SSE4_1<16, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_10bpp_SSE4_1<32, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_10bpp_SSE4_1<32, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_10bpp_SSE4_1<32, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420) + 
dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 5>; +#endif + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444) + 
dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = 
+ CflSubsampler444_WxH_SSE4_1<5, 5>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/intrapred_cfl_sse4.h b/src/dsp/x86/intrapred_cfl_sse4.h new file mode 100644 index 0000000..5d1a425 --- /dev/null +++ b/src/dsp/x86/intrapred_cfl_sse4.h @@ -0,0 +1,376 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. +void IntraPredCflInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 +#define 
LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor +#define 
LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +//------------------------------------------------------------------------------ +// 10bpp + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc new file mode 100644 index 0000000..e642aee --- /dev/null +++ b/src/dsp/x86/intrapred_directional_sse4.cc @@ -0,0 +1,1478 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred_directional.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning +// upsampling is ruled out. In addition, the bits masked by 0x3F for +// |shift_val| are 0 for all multiples of 64, so the formula +// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to +// val = top[top_base_x+1] << 5, meaning only the second set of pixels is +// involved in the output. Hence |top| is offset by 1. 
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride, + const uint8_t* const top, const int width, + const int height) { + ptrdiff_t offset = 1; + if (height == 4) { + memcpy(dst, top + offset, width); + dst += stride; + memcpy(dst, top + offset + 1, width); + dst += stride; + memcpy(dst, top + offset + 2, width); + dst += stride; + memcpy(dst, top + offset + 3, width); + return; + } + int y = 0; + do { + memcpy(dst, top + offset, width); + dst += stride; + memcpy(dst, top + offset + 1, width); + dst += stride; + memcpy(dst, top + offset + 2, width); + dst += stride; + memcpy(dst, top + offset + 3, width); + dst += stride; + memcpy(dst, top + offset + 4, width); + dst += stride; + memcpy(dst, top + offset + 5, width); + dst += stride; + memcpy(dst, top + offset + 6, width); + dst += stride; + memcpy(dst, top + offset + 7, width); + dst += stride; + + offset += 8; + y += 8; + } while (y < height); +} + +inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride, + const uint8_t* const top, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). + const int rounding_bits = 5; + const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; + const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); + const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100) + : _mm_set_epi64x(0, 0x0403030202010100); + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| + // is always greater than |height|, so clipping to 1 is enough to make the + // logic work. + const int xstep_units = std::max(xstep >> scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + int y = 0; + int top_x = xstep; + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> scale_bits; + + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + + // Load 8 values because we will select the sampled values based on + // |upsampled|. + const __m128i values = LoadLo8(top + top_base_x); + const __m128i sampled_values = _mm_shuffle_epi8(values, sampler); + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + __m128i prod = _mm_maddubs_epi16(sampled_values, shifts); + prod = RightShiftWithRounding_U16(prod, rounding_bits); + // Replace pixels from invalid range with top-right corner. + prod = _mm_blendv_epi8(prod, final_top_val, past_max); + Store4(dst, _mm_packus_epi16(prod, prod)); + } + + // Fill in corner-only rows. 
+ for (; y < height; ++y) { + memset(dst, top[max_base_x], /* width */ 4); + dst += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + const __m128i sampler = + upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const int scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). + const int rounding_bits = 5; + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| + // is always greater than |height|, so clipping to 1 is enough to make the + // logic work. + const int xstep_units = std::max(xstep >> scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + const int max_no_corner_y = std::min( + LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> scale_bits; + // Permit negative values of |top_x|. 
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; + for (; x < min_corner_only_x; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + // Assuming a 
buffer zone of 8 bytes at the end of top_row, this prevents + // reading out of bounds. If all indices are past max and we don't need to + // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will + // reset for the next |y|. + top_base_x &= ~_mm_cvtsi128_si32(past_max); + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + } + // Corner-only section of the row. + memset(dest + x, top_row[max_base_x], width - x); + } + // Fill in corner-only rows. + for (; y < height; ++y) { + memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + if (xstep == 64) { + DirectionalZone1_Step64(dest, stride, top_row, width, height); + return; + } + if (width == 4) { + DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled); + return; + } + if (width >= 32) { + DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, + upsampled); + return; + } + const __m128i sampler = + upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const int scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // No need to check for exceeding |max_base_x| in the loops. + if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) { + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + top_base_x += base_step8; + x += 8; + } while (x < width); + dest += stride; + top_x += xstep; + } while (++y < height); + return; + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + int top_x = xstep; + int y = 0; + do { + int top_base_x = top_x >> scale_bits; + + if (top_base_x >= max_base_x) { + for (int i = y; i < height; ++i) { + memset(dest, top_row[max_base_x], width); + dest += stride; + } + return; + } + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + for (; x < width - 8; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents + // reading out of bounds. If all indices are past max and we don't need to + // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will + // reset for the next |y|. 
+ top_base_x &= ~_mm_cvtsi128_si32(past_max); + const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); + __m128i vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + } + const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); + __m128i vals; + if (upsampled) { + vals = LoadUnaligned16(top_row + top_base_x); + } else { + const __m128i top_vals = LoadLo8(top_row + top_base_x); + vals = _mm_shuffle_epi8(top_vals, sampler); + vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15); + } + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + vals = _mm_blendv_epi8(vals, final_top_val, past_max); + StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); + dest += stride; + top_x += xstep; + } while (++y < height); +} + +void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const int width, const int height, + const int xstep, + const bool upsampled_top) { + const auto* const top_ptr = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep, + upsampled_top); +} + +template <bool upsampled> +inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const left_column, + const int base_left_y, const int ystep) { + // For use in the non-upsampled case. + const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100); + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + + __m128i result_block[4]; + for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) { + const int left_base_y = left_y >> scale_bits; + const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i vals; + if (upsampled) { + vals = LoadLo8(left_column + left_base_y); + } else { + const __m128i top_vals = LoadLo8(left_column + left_base_y); + vals = _mm_shuffle_epi8(top_vals, sampler); + } + vals = _mm_maddubs_epi16(vals, shifts); + vals = RightShiftWithRounding_U16(vals, rounding_bits); + result_block[x] = _mm_packus_epi16(vals, vals); + } + const __m128i result = Transpose4x4_U8(result_block); + // This is result_row0. + Store4(dest, result); + dest += stride; + const int result_row1 = _mm_extract_epi32(result, 1); + memcpy(dest, &result_row1, sizeof(result_row1)); + dest += stride; + const int result_row2 = _mm_extract_epi32(result, 2); + memcpy(dest, &result_row2, sizeof(result_row2)); + dest += stride; + const int result_row3 = _mm_extract_epi32(result, 3); + memcpy(dest, &result_row3, sizeof(result_row3)); +} + +template <bool upsampled, int height> +inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const left_column, + const int base_left_y, const int ystep) { + // For use in the non-upsampled case. + const __m128i sampler = + _mm_set_epi64x(0x0807070606050504, 0x0403030202010100); + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shift = _mm_set1_epi8(32); + // Downscaling for a weighted average whose weights sum to 32 (max_shift). 
+ const int rounding_bits = 5; + + __m128i result_block[8]; + for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) { + const int left_base_y = left_y >> scale_bits; + const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi8(shift_val); + const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); + __m128i vals; + if (upsampled) { + vals = LoadUnaligned16(left_column + left_base_y); + } else { + const __m128i top_vals = LoadUnaligned16(left_column + left_base_y); + vals = _mm_shuffle_epi8(top_vals, sampler); + } + vals = _mm_maddubs_epi16(vals, shifts); + result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits); + } + Transpose8x8_U16(result_block, result_block); + for (int y = 0; y < height; ++y) { + StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y])); + dest += stride; + } +} + +// 7.11.2.4 (9) angle > 180 +void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride, + const void* const left_column, + const int width, const int height, + const int ystep, + const bool upsampled) { + const auto* const left_ptr = static_cast<const uint8_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_shift = static_cast<int>(upsampled); + if (width == 4 || height == 4) { + const ptrdiff_t stride4 = stride << 2; + if (upsampled) { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_4x4<true>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride4; + y += 4; + } while (y < height); + left_y += ystep << 2; + x += 4; + } while (x < width); + } else { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y, + ystep); + dst_x += stride4; + y += 4; + } while (y < height); + left_y += ystep << 2; + x += 
4; + } while (x < width); + } + return; + } + + const ptrdiff_t stride8 = stride << 3; + if (upsampled) { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_8xH<true, 8>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride8; + y += 8; + } while (y < height); + left_y += ystep << 3; + x += 8; + } while (x < width); + } else { + int left_y = ystep; + int x = 0; + do { + uint8_t* dst_x = dst + x; + int y = 0; + do { + DirectionalZone3_8xH<false, 8>( + dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); + dst_x += stride8; + y += 8; + } while (y < height); + left_y += ystep << 3; + x += 8; + } while (x < width); + } +} + +//------------------------------------------------------------------------------ +// Directional Zone 2 Functions +// 7.11.2.4 (8) + +// DirectionalBlend* selectively overwrites the values written by +// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each +// row. 
+template <int y_selector> +inline void DirectionalBlend4_SSE4_1(uint8_t* dest, + const __m128i& dest_index_vect, + const __m128i& vals, + const __m128i& zone_bounds) { + const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector); + const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); + const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest)); + const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); + Store4(dest, _mm_packus_epi16(blended_vals, blended_vals)); +} + +inline void DirectionalBlend8_SSE4_1(uint8_t* dest, + const __m128i& dest_index_vect, + const __m128i& vals, + const __m128i& zone_bounds, + const __m128i& bounds_selector) { + const __m128i max_dest_x_vect = + _mm_shuffle_epi8(zone_bounds, bounds_selector); + const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); + const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest)); + const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); + StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals)); +} + +constexpr int kDirectionalWeightBits = 5; +// |source| is packed with 4 or 8 pairs of 8-bit values from left or top. +// |shifts| is named to match the specification, with 4 or 8 pairs of (32 - +// shift) and shift. Shift is guaranteed to be between 0 and 32. +inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source, + const __m128i& shifts, + const __m128i& sampler) { + const __m128i src_vals = LoadUnaligned16(source); + __m128i vals = _mm_shuffle_epi8(src_vals, sampler); + vals = _mm_maddubs_epi16(vals, shifts); + return RightShiftWithRounding_U16(vals, kDirectionalWeightBits); +} + +// Because the source values "move backwards" as the row index increases, the +// indices derived from ystep are generally negative. 
This is accommodated by +// making sure the relative indices are within [-15, 0] when the function is +// called, and sliding them into the inclusive range [0, 15], relative to a +// lower base address. +constexpr int kPositiveIndexOffset = 15; + +template <bool upsampled> +inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( + uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base, + __m128i left_y) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shifts = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i index_increment = _mm_cvtsi32_si128(0x01010101); + const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset); + // Left_column and sampler are both offset by 15 so the indices are always + // positive. + const uint8_t* left_column = left_column_base - kPositiveIndexOffset; + for (int y = 0; y < 4; dst += stride, ++y) { + __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); + offset_y = _mm_packs_epi16(offset_y, offset_y); + + const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); + __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); + // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they + // can work as shuffle indices. Some values may be out of bounds, but their + // pred results will be masked over by top prediction. 
+ sampler = _mm_add_epi8(sampler, positive_offset); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + left_column + (y << upsample_shift), shifts, sampler); + Store4(dst, _mm_packus_epi16(vals, vals)); + } +} + +// The height at which a load of 16 bytes will not contain enough source pixels +// from |left_column| to supply an accurate row when computing 8 pixels at a +// time. The values are found by inspection. By coincidence, all angles that +// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up +// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. +constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { + 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; + +template <bool upsampled> +inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( + uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, + __m128i left_y) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits = 6 - upsample_shift; + const __m128i max_shifts = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i index_increment = _mm_set1_epi8(1); + const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset); + for (int y = 0; y < 8; dst += stride, ++y) { + __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); + offset_y = _mm_packs_epi16(offset_y, offset_y); + const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); + + // Offset the relative index because ystep is negative in Zone 2 and shuffle + // indices must be nonnegative. 
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); + sampler = _mm_add_epi8(sampler, denegation); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + + // The specification adds (y << 6) to left_y, which is subject to + // upsampling, but this puts sampler indices out of the 0-15 range. It is + // equivalent to offset the source address by (y << upsample_shift) instead. + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + left_column - kPositiveIndexOffset + (y << upsample_shift), shifts, + sampler); + StoreLo8(dst, _mm_packus_epi16(vals, vals)); + } +} + +// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 << +// upsampled_top), for each row. When there are 4 values, they can be duplicated +// with a non-register shuffle mask. +// |shifts| is one pair of weights that applies throughout a given row. 
+template <bool upsampled_top> +inline void DirectionalZone1Blend_4x4( + uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, + __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, + const __m128i& dest_index_x, int top_x, const int xstep) { + const int upsample_shift = static_cast<int>(upsampled_top); + const int scale_bits_x = 6 - upsample_shift; + top_x -= xstep; + + int top_base_x = (top_x >> scale_bits_x); + const __m128i vals0 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler); + DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals1 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler); + DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals2 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler); + DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds); + top_x -= xstep; + dest += stride; + + top_base_x = (top_x >> scale_bits_x); + const __m128i vals3 = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler); + DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds); +} + +template <bool upsampled_top, int height> +inline void DirectionalZone1Blend_8xH( + uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, + __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, + const __m128i& dest_index_x, int top_x, const int xstep) { + const int upsample_shift = static_cast<int>(upsampled_top); + const int scale_bits_x = 6 - upsample_shift; + + __m128i y_selector = _mm_set1_epi32(0x01000100); + const __m128i index_increment = _mm_set1_epi32(0x02020202); + for 
(int y = 0; y < height; ++y, + y_selector = _mm_add_epi8(y_selector, index_increment), + dest += stride) { + top_x -= xstep; + const int top_base_x = top_x >> scale_bits_x; + const __m128i vals = DirectionalZone2FromSource_SSE4_1( + top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler); + DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector); + } +} + +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for this function is to know how many blocks can be processed +// with just pixels from |top_ptr|, then handle mixed blocks, then handle only +// blocks that take from |left_ptr|. Additionally, a fast index-shuffle +// approach is used for pred values from |left_column| in sections that permit +// it. +template <bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const uint8_t* const left_column, + const int width, const int height, + const int xstep, const int ystep) { + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + const __m128i max_shift = _mm_set1_epi8(32); + const ptrdiff_t stride8 = stride << 3; + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute. This assumes minimum |xstep| is 3. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + + // For steep angles, the source pixels from left_column may not fit in a + // 16-byte load for shuffling. + // TODO(petersonab): Find a more precise formula for this subject to x. 
+ const int max_shuffle_height = + std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); + + const int xstep8 = xstep << 3; + const __m128i xstep8_vect = _mm_set1_epi16(xstep8); + // Accumulate xstep across 8 rows. + const __m128i xstep_dup = _mm_set1_epi16(-xstep); + const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); + const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + const __m128i scaled_one = _mm_set1_epi16(-64); + __m128i xstep_bounds_base = + (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift) + : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + const int ystep8 = ystep << 3; + const int left_base_increment8 = ystep8 >> 6; + const int ystep_remainder8 = ystep8 & 0x3F; + const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8); + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which is covered under the left_column + // offset. Following values need the full ystep as a relative offset. + const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); + const __m128i ystep_dup = _mm_set1_epi16(-ystep); + __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); + left_y = _mm_add_epi16(ystep_init, left_y); + + const __m128i increment_top8 = _mm_set1_epi16(8 << 6); + int x = 0; + + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. 
+ for (int left_offset = -left_base_increment; x < min_top_only_x; + x += 8, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), + // Watch left_y because it can still get big. + left_y = _mm_add_epi16(left_y, increment_left8), + left_offset -= left_base_increment8) { + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + DirectionalZone1_4xH(dst_x + 4, stride, + top_row + ((x + 4) << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. + const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); + // At high angles such that min_left_only_y < 8, ystep is low and xstep is + // high. This means that max_shuffle_height is unbounded and xstep_bounds + // will overflow in 16 bits. This is prevented by stopping the first + // blending loop at min_left_only_y for such cases, which means we skip over + // the second blending loop as well. 
+ const int left_shuffle_stop_y = + std::min(max_shuffle_height, min_left_only_y); + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + for (; y < left_shuffle_stop_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Pick up from the last y-value, using the 10% slower but secure method for + // left prediction. 
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left_only rows. 
+ for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + } + } + for (; x < width; x += 4) { + DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), + height, -xstep, upsampled_top); + } +} + +template <bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, + const uint8_t* const top_row, + const uint8_t* const left_column, + const int width, const int height, + const int xstep, const int ystep) { + auto* dst = static_cast<uint8_t*>(dest); + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + const __m128i max_shift = _mm_set1_epi8(32); + const ptrdiff_t stride4 = stride << 2; + const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute. + assert(xstep >= 3); + const int min_top_only_x = std::min((height * xstep) >> 6, width); + + const int xstep4 = xstep << 2; + const __m128i xstep4_vect = _mm_set1_epi16(xstep4); + const __m128i xstep_dup = _mm_set1_epi16(-xstep); + const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001); + __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); + const __m128i scaled_one = _mm_set1_epi16(-64); + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + __m128i xstep_bounds_base = + (xstep == 64) ? 
_mm_sub_epi16(scaled_one, xstep_for_shift) + : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + const int ystep4 = ystep << 2; + const int left_base_increment4 = ystep4 >> 6; + // This is guaranteed to be less than 64, but accumulation may bring it past + // 64 for higher x values. + const int ystep_remainder4 = ystep4 & 0x3F; + const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4); + const __m128i increment_top4 = _mm_set1_epi16(4 << 6); + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which will go into the left_column offset. + // Following values need the full ystep as a relative offset. + const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); + const __m128i ystep_dup = _mm_set1_epi16(-ystep); + __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); + left_y = _mm_add_epi16(ystep_init, left_y); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + + int x = 0; + // Loop over x for columns with a mixture of sources. + for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4), + left_y = _mm_add_epi16(left_y, increment_left4), + left_offset -= left_base_increment4) { + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. Rounded up to the nearest multiple of 4. 
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height); + + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + // Loop over y for mixed rows. + for (; y < min_left_only_y; + y += 4, dst_x += stride4, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect), + top_x -= xstep4) { + DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) * (1 << upsample_left_shift)), + left_y); + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_4x4<upsampled_top>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left-only rows, if any. + for (; y < height; y += 4, dst_x += stride4) { + DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + } + } + // Loop over top-only columns, if any. 
+ for (; x < width; x += 4) { + DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), + height, -xstep, upsampled_top); + } +} + +void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + const int width, const int height, + const int xstep, const int ystep, + const bool upsampled_top, + const bool upsampled_left) { + // Increasing the negative buffer for this function allows more rows to be + // processed at a time without branching in an inner loop to check the base. + uint8_t top_buffer[288]; + uint8_t left_buffer[288]; + memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160); + memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160); + const uint8_t* top_ptr = top_buffer + 144; + const uint8_t* left_ptr = left_buffer + 144; + if (width == 4 || height == 4) { + if (upsampled_left) { + if (upsampled_top) { + DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } else { + if (upsampled_top) { + DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } + return; + } + if (upsampled_left) { + if (upsampled_top) { + DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } + } else { + if (upsampled_top) { + DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } else { + DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, + width, height, xstep, ystep); + } 
+ } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1) + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2) + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3) + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// 7.11.2.4. Directional intra prediction process + +// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning +// upsampling is ruled out. In addition, the bits masked by 0x3F for +// |shift_val| are 0 for all multiples of 64, so the formula +// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to +// val = top[top_base_x+1] << 5, meaning only the second set of pixels is +// involved in the output. Hence |top| is offset by 1. 
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride, + const uint16_t* const top, const int width, + const int height) { + ptrdiff_t offset = 1; + if (height == 4) { + memcpy(dst, top + offset, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 1, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 2, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 3, width * sizeof(dst[0])); + return; + } + int y = height; + do { + memcpy(dst, top + offset, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 1, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 2, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 3, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 4, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 5, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 6, width * sizeof(dst[0])); + dst += stride; + memcpy(dst, top + offset + 7, width * sizeof(dst[0])); + dst += stride; + + offset += 8; + y -= 8; + } while (y != 0); +} + +// Produce a weighted average whose weights sum to 32. +inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler, + const __m128i& shifts, + const __m128i& top_indices, + const __m128i& final_top_val, + const __m128i& border_index) { + const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler); + __m128i prod = _mm_mullo_epi16(sampled_values, shifts); + prod = _mm_hadd_epi16(prod, prod); + const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/); + + const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index); + // Replace pixels from invalid range with top-right corner. + return _mm_blendv_epi8(result, final_top_val, past_max); +} + +// When width is 4, only one load operation is needed per iteration. We also +// avoid extra loop precomputations that cause too much overhead. 
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride, + const uint16_t* const top, const int height, + const int xstep, const bool upsampled, + const __m128i& sampler) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" because + // only cmpgt is available. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int y = 0; + int top_x = xstep; + const __m128i max_shift = _mm_set1_epi16(32); + + for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { + const int top_base_x = top_x >> index_scale_bits; + + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + // Load 8 values because we will select the sampled values based on + // |upsampled|. 
+ const __m128i values = LoadUnaligned16(top + top_base_x); + const __m128i pred = + CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val, + max_base_x_vect); + StoreLo8(dst, pred); + } + + // Fill in corner-only rows. + for (; y < height; ++y) { + Memset(dst, top[max_base_x], /* width */ 4); + dst += stride; + } +} + +// General purpose combine function. +// |check_border| means the final source value has to be duplicated into the +// result. This simplifies the loop structures that use precomputed boundaries +// to identify sections where it is safe to compute without checking for the +// right border. +template <bool check_border> +inline __m128i CombineTopVals( + const __m128i& top_vals_0, const __m128i& top_vals_1, + const __m128i& sampler, const __m128i& shifts, + const __m128i& top_indices = _mm_setzero_si128(), + const __m128i& final_top_val = _mm_setzero_si128(), + const __m128i& border_index = _mm_setzero_si128()) { + constexpr int scale_int_bits = 5; + const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler); + const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler); + const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts); + const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts); + const __m128i combined = _mm_hadd_epi16(prod_0, prod_1); + const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits); + if (check_border) { + const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index); + // Replace pixels from invalid range with top-right corner. 
+ return _mm_blendv_epi8(result, final_top_val, past_max); + } + return result; +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride, + const uint16_t* const top_row, + const int width, const int height, + const int xstep, const bool upsampled, + const __m128i& sampler) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi16(32); + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping to 1 is enough + // to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + // Rows up to this y-value can be computed without checking for bounds. + const int max_no_corner_y = std::min( + LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep, + height); + // No need to check for exceeding |max_base_x| in the first loop. + int y = 0; + int top_x = xstep; + for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + // Permit negative values of |top_x|. 
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + + const __m128i pred = + CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts); + + StoreUnaligned16(dest + x, pred); + top_base_x += base_step8; + x += 8; + } while (x < width); + } + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to |top_base_x|, it is used to mask values + // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge" + // which is not supported for packed integers. + const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + int x = 0; + const int min_corner_only_x = + std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; + for (; x < min_corner_only_x; + x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i top_vals_0 = LoadUnaligned16(top_row + 
top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts, + top_index_vect, final_top_val, max_base_x_vect); + StoreUnaligned16(dest + x, pred); + } + // Corner-only section of the row. + Memset(dest + x, top_row[max_base_x], width - x); + } + // Fill in corner-only rows. + for (; y < height; ++y) { + Memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +// 7.11.2.4 (7) angle < 90 +inline void DirectionalIntraPredictorZone1_SSE4_1( + void* dest_ptr, ptrdiff_t stride, const void* const top_ptr, + const int width, const int height, const int xstep, const bool upsampled) { + const auto* const top_row = static_cast<const uint16_t*>(top_ptr); + auto* dest = static_cast<uint16_t*>(dest_ptr); + stride /= sizeof(uint16_t); + const int upsample_shift = static_cast<int>(upsampled); + if (xstep == 64) { + DirectionalZone1_Step64(dest, stride, top_row, width, height); + return; + } + // Each base pixel paired with its following pixel, for hadd purposes. + const __m128i adjacency_shuffler = _mm_set_epi16( + 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100); + // This is equivalent to not shuffling at all. + const __m128i identity_shuffler = _mm_set_epi16( + 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); + // This represents a trade-off between code size and speed. When upsampled + // is true, no shuffle is necessary. But to avoid in-loop branching, we + // would need 2 copies of the main function body. + const __m128i sampler = upsampled ? 
identity_shuffler : adjacency_shuffler; + if (width == 4) { + DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled, + sampler); + return; + } + if (width >= 32) { + DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, + upsampled, sampler); + return; + } + const int index_scale_bits = 6 - upsample_shift; + const int max_base_x = ((width + height) - 1) << upsample_shift; + + const __m128i max_shift = _mm_set1_epi16(32); + const int base_step = 1 << upsample_shift; + const int base_step8 = base_step << 3; + + // No need to check for exceeding |max_base_x| in the loops. + if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) { + int top_x = xstep; + int y = height; + do { + int top_base_x = top_x >> index_scale_bits; + // Permit negative values of |top_x|. + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + int x = 0; + do { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts); + StoreUnaligned16(dest + x, pred); + top_base_x += base_step8; + x += 8; + } while (x < width); + dest += stride; + top_x += xstep; + } while (--y != 0); + return; + } + + // General case. Blocks with width less than 32 do not benefit from x-wise + // loop splitting, but do benefit from using memset on appropriate rows. + + // Each 16-bit value here corresponds to a position that may exceed + // |max_base_x|. When added to the top_base_x, it is used to mask values + // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is + // not supported for packed integers. 
+ const __m128i offsets = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + + const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); + const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); + const __m128i base_step8_vect = _mm_set1_epi16(base_step8); + + // All rows from |min_corner_only_y| down will simply use memcpy. + // |max_base_x| is always greater than |height|, so clipping the denominator + // to 1 is enough to make the logic work. + const int xstep_units = std::max(xstep >> index_scale_bits, 1); + const int min_corner_only_y = std::min(max_base_x / xstep_units, height); + + int top_x = xstep; + int y = 0; + for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { + int top_base_x = top_x >> index_scale_bits; + + const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; + const __m128i shift = _mm_set1_epi16(shift_val); + const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift); + const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift); + __m128i top_index_vect = _mm_set1_epi16(top_base_x); + top_index_vect = _mm_add_epi16(top_index_vect, offsets); + + for (int x = 0; x < width; x += 8, top_base_x += base_step8, + top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { + const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x); + const __m128i top_vals_1 = + LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift)); + const __m128i pred = + CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts, + top_index_vect, final_top_val, max_base_x_vect); + StoreUnaligned16(dest + x, pred); + } + } + + // Fill in corner-only rows. 
+ for (; y < height; ++y) { + Memset(dest, top_row[max_base_x], width); + dest += stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1) + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredDirectionalInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredDirectionalInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_directional_sse4.h b/src/dsp/x86/intrapred_directional_sse4.h new file mode 100644 index 0000000..b352450 --- /dev/null +++ b/src/dsp/x86/intrapred_directional_sse4.h @@ -0,0 +1,54 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. These functions are not thread-safe. +void IntraPredDirectionalInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_ diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc new file mode 100644 index 0000000..022af8d --- /dev/null +++ b/src/dsp/x86/intrapred_filter_sse4.cc @@ -0,0 +1,432 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <xmmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// FilterIntraPredictor_SSE4_1 +// Section 7.11.2.3. Recursive intra prediction process +// This filter applies recursively to 4x2 sub-blocks within the transform block, +// meaning that the predicted pixels in each sub-block are used as inputs to +// sub-blocks below and to the right, if present. +// +// Each output value in the sub-block is predicted by a different filter applied +// to the same array of top-left, top, and left values. If fn refers to the +// output of the nth filter, given this block: +// TL T0 T1 T2 T3 +// L0 f0 f1 f2 f3 +// L1 f4 f5 f6 f7 +// The filter input order is p0, p1, p2, p3, p4, p5, p6: +// p0 p1 p2 p3 p4 +// p5 f0 f1 f2 f3 +// p6 f4 f5 f6 f7 +// Filters usually apply to 8 values for convenience, so in this case we fix +// the 8th filter tap to 0 and disregard the value of the 8th input. + +// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which +// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. 
+constexpr int kDuplicateFirstHalf = 0x44; + +// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th +// at zero to preserve the sum. +// |pixels| contains p0-p7 in order as shown above. +// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on. +inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, + const __m128i& pixels, const __m128i& taps_0_1, + const __m128i& taps_2_3, const __m128i& taps_4_5, + const __m128i& taps_6_7) { + const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); + const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); + // |output_half| contains 8 partial sums for f0-f7. + __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); + __m128i output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row0 = + _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), + /* unused half */ output); + Store4(dst, output_row0); + const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5); + const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7); + output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); + output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row1 = + _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), + /* arbitrary pack arg */ output); + Store4(dst + stride, output_row1); +} + +// 4xH transform sizes are given special treatment because LoadLo8 goes out +// of bounds and every block involves the left column. The top-left pixel, p0, +// is stored in the top buffer for the first 4x2, but comes from the left buffer +// for successive blocks. This implementation takes advantage of the fact +// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer, +// using shifts to arrange things to fit reusable shuffle vectors. 
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, + const uint8_t* const top_ptr, + const uint8_t* const left_ptr, FilterIntraPredictor pred, + const int height) { + // Two filter kernels per vector. + const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); + const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); + const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); + const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); + __m128i top = Load4(top_ptr - 1); + __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4); + __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr)); + left = _mm_slli_si128(left, 5); + + // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], + // left[2], left[3], left[4], left[5], left[6], left[7] + // Let rn represent a pixel usable as pn for the 4x2 after this one. We get: + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p0 p1 p2 p3 p4 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // Two sets of the same input pixels to apply two filters at once. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 1. + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], + // left[0], left[1], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last + // byte is an unused value, which shall be multiplied by 0 when we apply the + // filter. + constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; + + // Insert left[-1] in front as TL and put left[0] and left[1] at the end. 
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 2. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 3. + + // Compute the middle 8 rows before using common code for the final 4 rows, in + // order to fit the assumption that |left| has the next TL at position 8. + if (height == 16) { + // This shift allows us to use pixel_order2 twice after shifting by 2 later. + left = _mm_slli_si128(left, 1); + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], + // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The + // last byte is an unused value, as above. The top-left was shifted to + // position nine to keep two empty spaces after the top pixels. + constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; + + // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at + // the end. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 4. + + // First 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // Clear all but final pixel in the first 8 of left column. + __m128i keep_top_left = _mm_srli_si128(left, 13); + dest += stride; // Move to y = 5. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-6], + // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ... 
+ // r0 + pixels = _mm_or_si128(left, pixels); + left = LoadLo8(left_ptr + 8); + + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 6. + + // Second 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // Position TL value so we can use pixel_order1. + keep_top_left = _mm_slli_si128(keep_top_left, 6); + dest += stride; // Move to y = 7. + pixels = Load4(dest); + left = _mm_slli_si128(left, 7); + left = _mm_or_si128(left, keep_top_left); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 8. + + // Third 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 9. + + // Prepare final inputs. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 10. + + // Fourth 4x2 in the if body. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 11. + } + + // In both the 8 and 16 case at this point, we can assume that |left| has the + // next TL at position 8. + if (height > 4) { + // Erase prior left pixels by shifting TL to position 0. 
+ left = _mm_srli_si128(left, 8); + left = _mm_slli_si128(left, 6); + pixels = Load4(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 12 or 4. + + // First of final two 4x2 blocks. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dest += stride; // Move to y = 13 or 5. + pixels = Load4(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ... + // r0 + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 14 or 6. + + // Last of final two 4x2 blocks. + Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + } +} + +void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column, + FilterIntraPredictor pred, const int width, + const int height) { + const auto* const top_ptr = static_cast<const uint8_t*>(top_row); + const auto* const left_ptr = static_cast<const uint8_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + if (width == 4) { + Filter4xH(dst, stride, top_ptr, left_ptr, pred, height); + return; + } + + // There is one set of 7 taps for each of the 4x2 output pixels. 
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]); + const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]); + const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]); + const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]); + + // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at + // the end is an unused value, which shall be multiplied by 0 when we apply + // the filter. + constexpr int64_t kCondenseLeftMask = 0x0F09080403020100; + + // Takes the "left section" and puts it right after p0-p4. + const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); + + // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last + // byte is unused as above. + constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008; + + // Shuffles the "top left" from the left section, to the front. Used when + // grabbing data from left_column and not top_row. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); + + // This first pass takes care of the cases where the top left pixel comes from + // top_row. + __m128i pixels = LoadLo8(top_ptr - 1); + __m128i left = _mm_slli_si128(Load4(left_column), 8); + pixels = _mm_or_si128(pixels, left); + + // Two sets of the same pixels to multiply with two sets of taps. + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7); + left = _mm_srli_si128(left, 1); + + // Load + pixels = Load4(dst + stride); + + // Because of the above shift, this OR 'invades' the final of the first 8 + // bytes of |pixels|. This is acceptable because the 8th filter tap is always + // a padded 0. 
+ pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + const ptrdiff_t stride2 = stride << 1; + const ptrdiff_t stride4 = stride << 2; + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + dst += 4; + for (int x = 3; x < width - 4; x += 4) { + pixels = Load4(top_ptr + x); + pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4); + pixels = _mm_insert_epi8(pixels, dst[-1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + pixels = Load4(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + dst += 4; + } + + // Now we handle heights that reference previous blocks rather than top_row. + for (int y = 4; y < height; y += 4) { + // Leftmost 4x4 block for this height. + dst -= width; + dst += stride4; + + // Top Left is not available by offset in these leftmost blocks. + pixels = Load4(dst - stride); + left = _mm_slli_si128(Load4(left_ptr + y - 1), 8); + left = _mm_insert_epi8(left, left_ptr[y + 3], 12); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + + // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. 
+ left = _mm_srli_si128(left, 2); + pixels = Load4(dst + stride); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + + dst += 4; + + // Remaining 4x4 blocks for this height. + for (int x = 4; x < width; x += 4) { + pixels = Load4(dst - stride - 1); + pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[-1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, + taps_6_7); + pixels = Load4(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); + Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, + taps_4_5, taps_6_7); + dst += 4; + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast<void>(dsp); +// These guards check if this version of the function was not superseded by +// a higher optimization level, such as AVX. The corresponding #define also +// prevents the C version from being added to the table. 
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) + dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; +#endif +} + +} // namespace + +void IntraPredFilterInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredFilterInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_filter_sse4.h b/src/dsp/x86/intrapred_filter_sse4.h new file mode 100644 index 0000000..ce28f93 --- /dev/null +++ b/src/dsp/x86/intrapred_filter_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. +void IntraPredFilterInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_ diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc index e944ea3..de9f551 100644 --- a/src/dsp/x86/intrapred_smooth_sse4.cc +++ b/src/dsp/x86/intrapred_smooth_sse4.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/intrapred.h" +#include "src/dsp/intrapred_smooth.h" #include "src/utils/cpu.h" #if LIBGAV1_TARGETING_SSE4_1 @@ -22,12 +22,12 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -67,29 +67,6 @@ inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left, Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8)); } -template <int y_mask> -inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights, - const __m128i& scaled_bottom_left) { - const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask); - const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y); - const __m128i scaled_bottom_left_y = - _mm_shuffle_epi32(scaled_bottom_left, y_mask); - return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y); -} - -template <int y_mask> -inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top, - const __m128i& weights, - const __m128i& scaled_bottom_left, - const __m128i& round) { - __m128i pred_sum = - SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left); - // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
- pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8); - const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); - Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8)); -} - // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, // |pixels| is a segment of the top row or the whole top row, and |weights| is // repeated. diff --git a/src/dsp/x86/intrapred_smooth_sse4.h b/src/dsp/x86/intrapred_smooth_sse4.h new file mode 100644 index 0000000..9353371 --- /dev/null +++ b/src/dsp/x86/intrapred_smooth_sse4.h @@ -0,0 +1,318 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 
+#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif 
+#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc index 9938dfe..063929d 100644 --- a/src/dsp/x86/intrapred_sse4.cc +++ b/src/dsp/x86/intrapred_sse4.cc @@ -23,13 +23,14 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstring> // memcpy +#include <cstring> #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_sse4.h" #include "src/dsp/x86/transpose_sse4.h" #include "src/utils/common.h" +#include "src/utils/constants.h" namespace libgav1 { namespace dsp { @@ -51,10 +52,6 @@ inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) { return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier)); } -// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which -// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. -constexpr int kDuplicateFirstHalf = 0x44; - //------------------------------------------------------------------------------ // DcPredFuncs_SSE4_1 @@ -1408,1337 +1405,6 @@ void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride, WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3); } -//------------------------------------------------------------------------------ -// 7.11.2.4. Directional intra prediction process - -// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning -// upsampling is ruled out. In addition, the bits masked by 0x3F for -// |shift_val| are 0 for all multiples of 64, so the formula -// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to -// val = top[top_base_x+1] << 5, meaning only the second set of pixels is -// involved in the output. Hence |top| is offset by 1. 
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride, - const uint8_t* const top, const int width, - const int height) { - ptrdiff_t offset = 1; - if (height == 4) { - memcpy(dst, top + offset, width); - dst += stride; - memcpy(dst, top + offset + 1, width); - dst += stride; - memcpy(dst, top + offset + 2, width); - dst += stride; - memcpy(dst, top + offset + 3, width); - return; - } - int y = 0; - do { - memcpy(dst, top + offset, width); - dst += stride; - memcpy(dst, top + offset + 1, width); - dst += stride; - memcpy(dst, top + offset + 2, width); - dst += stride; - memcpy(dst, top + offset + 3, width); - dst += stride; - memcpy(dst, top + offset + 4, width); - dst += stride; - memcpy(dst, top + offset + 5, width); - dst += stride; - memcpy(dst, top + offset + 6, width); - dst += stride; - memcpy(dst, top + offset + 7, width); - dst += stride; - - offset += 8; - y += 8; - } while (y < height); -} - -inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride, - const uint8_t* const top, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const int rounding_bits = 5; - const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift; - const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]); - const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100) - : _mm_set_epi64x(0, 0x0403030202010100); - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. - const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - // All rows from |min_corner_only_y| down will simply use memcpy. 
|max_base_x| - // is always greater than |height|, so clipping to 1 is enough to make the - // logic work. - const int xstep_units = std::max(xstep >> scale_bits, 1); - const int min_corner_only_y = std::min(max_base_x / xstep_units, height); - - // Rows up to this y-value can be computed without checking for bounds. - int y = 0; - int top_x = xstep; - - for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) { - const int top_base_x = top_x >> scale_bits; - - // Permit negative values of |top_x|. - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i max_shift = _mm_set1_epi8(32); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - - // Load 8 values because we will select the sampled values based on - // |upsampled|. - const __m128i values = LoadLo8(top + top_base_x); - const __m128i sampled_values = _mm_shuffle_epi8(values, sampler); - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - __m128i prod = _mm_maddubs_epi16(sampled_values, shifts); - prod = RightShiftWithRounding_U16(prod, rounding_bits); - // Replace pixels from invalid range with top-right corner. - prod = _mm_blendv_epi8(prod, final_top_val, past_max); - Store4(dst, _mm_packus_epi16(prod, prod)); - } - - // Fill in corner-only rows. 
- for (; y < height; ++y) { - memset(dst, top[max_base_x], /* width */ 4); - dst += stride; - } -} - -// 7.11.2.4 (7) angle < 90 -inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const int width, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - const __m128i sampler = - upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const int scale_bits = 6 - upsample_shift; - const int max_base_x = ((width + height) - 1) << upsample_shift; - - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - const int base_step = 1 << upsample_shift; - const int base_step8 = base_step << 3; - - // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x| - // is always greater than |height|, so clipping to 1 is enough to make the - // logic work. - const int xstep_units = std::max(xstep >> scale_bits, 1); - const int min_corner_only_y = std::min(max_base_x / xstep_units, height); - - // Rows up to this y-value can be computed without checking for bounds. - const int max_no_corner_y = std::min( - LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep, - height); - // No need to check for exceeding |max_base_x| in the first loop. - int y = 0; - int top_x = xstep; - for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) { - int top_base_x = top_x >> scale_bits; - // Permit negative values of |top_x|. 
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - int x = 0; - do { - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - top_base_x += base_step8; - x += 8; - } while (x < width); - } - - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. - const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); - const __m128i base_step8_vect = _mm_set1_epi16(base_step8); - for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) { - int top_base_x = top_x >> scale_bits; - - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - - int x = 0; - const int min_corner_only_x = - std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7; - for (; x < min_corner_only_x; - x += 8, top_base_x += base_step8, - top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - // Assuming a 
buffer zone of 8 bytes at the end of top_row, this prevents - // reading out of bounds. If all indices are past max and we don't need to - // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will - // reset for the next |y|. - top_base_x &= ~_mm_cvtsi128_si32(past_max); - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - } - // Corner-only section of the row. - memset(dest + x, top_row[max_base_x], width - x); - } - // Fill in corner-only rows. - for (; y < height; ++y) { - memset(dest, top_row[max_base_x], width); - dest += stride; - } -} - -// 7.11.2.4 (7) angle < 90 -inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const int width, const int height, - const int xstep, const bool upsampled) { - const int upsample_shift = static_cast<int>(upsampled); - if (xstep == 64) { - DirectionalZone1_Step64(dest, stride, top_row, width, height); - return; - } - if (width == 4) { - DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled); - return; - } - if (width >= 32) { - DirectionalZone1_Large(dest, stride, top_row, width, height, xstep, - upsampled); - return; - } - const __m128i sampler = - upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const int scale_bits = 6 - upsample_shift; - const int max_base_x = ((width + height) - 1) << upsample_shift; - - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - const int base_step = 1 << upsample_shift; - const int base_step8 = base_step << 3; - - // No need to check for exceeding |max_base_x| in the loops. 
- if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) { - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - // Permit negative values of |top_x|. - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - int x = 0; - do { - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - top_base_x += base_step8; - x += 8; - } while (x < width); - dest += stride; - top_x += xstep; - } while (++y < height); - return; - } - - // Each 16-bit value here corresponds to a position that may exceed - // |max_base_x|. When added to the top_base_x, it is used to mask values - // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is - // not supported for packed integers. 
- const __m128i offsets = - _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); - - const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x); - const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]); - const __m128i base_step8_vect = _mm_set1_epi16(base_step8); - int top_x = xstep; - int y = 0; - do { - int top_base_x = top_x >> scale_bits; - - if (top_base_x >= max_base_x) { - for (int i = y; i < height; ++i) { - memset(dest, top_row[max_base_x], width); - dest += stride; - } - return; - } - - const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i top_index_vect = _mm_set1_epi16(top_base_x); - top_index_vect = _mm_add_epi16(top_index_vect, offsets); - - int x = 0; - for (; x < width - 8; - x += 8, top_base_x += base_step8, - top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) { - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents - // reading out of bounds. If all indices are past max and we don't need to - // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will - // reset for the next |y|. 
- top_base_x &= ~_mm_cvtsi128_si32(past_max); - const __m128i top_vals = LoadUnaligned16(top_row + top_base_x); - __m128i vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - } - const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect); - __m128i vals; - if (upsampled) { - vals = LoadUnaligned16(top_row + top_base_x); - } else { - const __m128i top_vals = LoadLo8(top_row + top_base_x); - vals = _mm_shuffle_epi8(top_vals, sampler); - vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15); - } - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - vals = _mm_blendv_epi8(vals, final_top_val, past_max); - StoreLo8(dest + x, _mm_packus_epi16(vals, vals)); - dest += stride; - top_x += xstep; - } while (++y < height); -} - -void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const auto* const top_ptr = static_cast<const uint8_t*>(top_row); - auto* dst = static_cast<uint8_t*>(dest); - DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep, - upsampled_top); -} - -template <bool upsampled> -inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const left_column, - const int base_left_y, const int ystep) { - // For use in the non-upsampled case. 
- const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100); - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - - __m128i result_block[4]; - for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) { - const int left_base_y = left_y >> scale_bits; - const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i vals; - if (upsampled) { - vals = LoadLo8(left_column + left_base_y); - } else { - const __m128i top_vals = LoadLo8(left_column + left_base_y); - vals = _mm_shuffle_epi8(top_vals, sampler); - } - vals = _mm_maddubs_epi16(vals, shifts); - vals = RightShiftWithRounding_U16(vals, rounding_bits); - result_block[x] = _mm_packus_epi16(vals, vals); - } - const __m128i result = Transpose4x4_U8(result_block); - // This is result_row0. - Store4(dest, result); - dest += stride; - const int result_row1 = _mm_extract_epi32(result, 1); - memcpy(dest, &result_row1, sizeof(result_row1)); - dest += stride; - const int result_row2 = _mm_extract_epi32(result, 2); - memcpy(dest, &result_row2, sizeof(result_row2)); - dest += stride; - const int result_row3 = _mm_extract_epi32(result, 3); - memcpy(dest, &result_row3, sizeof(result_row3)); -} - -template <bool upsampled, int height> -inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const left_column, - const int base_left_y, const int ystep) { - // For use in the non-upsampled case. 
- const __m128i sampler = - _mm_set_epi64x(0x0807070606050504, 0x0403030202010100); - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shift = _mm_set1_epi8(32); - const int rounding_bits = 5; - - __m128i result_block[8]; - for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) { - const int left_base_y = left_y >> scale_bits; - const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; - const __m128i shift = _mm_set1_epi8(shift_val); - const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift); - const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift); - __m128i vals; - if (upsampled) { - vals = LoadUnaligned16(left_column + left_base_y); - } else { - const __m128i top_vals = LoadUnaligned16(left_column + left_base_y); - vals = _mm_shuffle_epi8(top_vals, sampler); - } - vals = _mm_maddubs_epi16(vals, shifts); - result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits); - } - Transpose8x8_U16(result_block, result_block); - for (int y = 0; y < height; ++y) { - StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y])); - dest += stride; - } -} - -// 7.11.2.4 (9) angle > 180 -void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled) { - const auto* const left_ptr = static_cast<const uint8_t*>(left_column); - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_shift = static_cast<int>(upsampled); - if (width == 4 || height == 4) { - const ptrdiff_t stride4 = stride << 2; - if (upsampled) { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_4x4<true>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride4; - y += 4; - } while (y < height); - left_y += ystep << 2; - x += 4; - } while (x < width); - } else { - int left_y = 
ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y, - ystep); - dst_x += stride4; - y += 4; - } while (y < height); - left_y += ystep << 2; - x += 4; - } while (x < width); - } - return; - } - - const ptrdiff_t stride8 = stride << 3; - if (upsampled) { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_8xH<true, 8>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride8; - y += 8; - } while (y < height); - left_y += ystep << 3; - x += 8; - } while (x < width); - } else { - int left_y = ystep; - int x = 0; - do { - uint8_t* dst_x = dst + x; - int y = 0; - do { - DirectionalZone3_8xH<false, 8>( - dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep); - dst_x += stride8; - y += 8; - } while (y < height); - left_y += ystep << 3; - x += 8; - } while (x < width); - } -} - -//------------------------------------------------------------------------------ -// Directional Zone 2 Functions -// 7.11.2.4 (8) - -// DirectionalBlend* selectively overwrites the values written by -// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each -// row. 
-template <int y_selector> -inline void DirectionalBlend4_SSE4_1(uint8_t* dest, - const __m128i& dest_index_vect, - const __m128i& vals, - const __m128i& zone_bounds) { - const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector); - const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); - const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest)); - const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); - Store4(dest, _mm_packus_epi16(blended_vals, blended_vals)); -} - -inline void DirectionalBlend8_SSE4_1(uint8_t* dest, - const __m128i& dest_index_vect, - const __m128i& vals, - const __m128i& zone_bounds, - const __m128i& bounds_selector) { - const __m128i max_dest_x_vect = - _mm_shuffle_epi8(zone_bounds, bounds_selector); - const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect); - const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest)); - const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left); - StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals)); -} - -constexpr int kDirectionalWeightBits = 5; -// |source| is packed with 4 or 8 pairs of 8-bit values from left or top. -// |shifts| is named to match the specification, with 4 or 8 pairs of (32 - -// shift) and shift. Shift is guaranteed to be between 0 and 32. -inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source, - const __m128i& shifts, - const __m128i& sampler) { - const __m128i src_vals = LoadUnaligned16(source); - __m128i vals = _mm_shuffle_epi8(src_vals, sampler); - vals = _mm_maddubs_epi16(vals, shifts); - return RightShiftWithRounding_U16(vals, kDirectionalWeightBits); -} - -// Because the source values "move backwards" as the row index increases, the -// indices derived from ystep are generally negative. 
This is accommodated by -// making sure the relative indices are within [-15, 0] when the function is -// called, and sliding them into the inclusive range [0, 15], relative to a -// lower base address. -constexpr int kPositiveIndexOffset = 15; - -template <bool upsampled> -inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( - uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base, - __m128i left_y) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shifts = _mm_set1_epi8(32); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - const __m128i index_increment = _mm_cvtsi32_si128(0x01010101); - const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset); - // Left_column and sampler are both offset by 15 so the indices are always - // positive. - const uint8_t* left_column = left_column_base - kPositiveIndexOffset; - for (int y = 0; y < 4; dst += stride, ++y) { - __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); - offset_y = _mm_packs_epi16(offset_y, offset_y); - - const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); - __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); - // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they - // can work as shuffle indices. Some values may be out of bounds, but their - // pred results will be masked over by top prediction. 
- sampler = _mm_add_epi8(sampler, positive_offset); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - left_column + (y << upsample_shift), shifts, sampler); - Store4(dst, _mm_packus_epi16(vals, vals)); - } -} - -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -template <bool upsampled> -inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( - uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, - __m128i left_y) { - const int upsample_shift = static_cast<int>(upsampled); - const int scale_bits = 6 - upsample_shift; - const __m128i max_shifts = _mm_set1_epi8(32); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - const __m128i index_increment = _mm_set1_epi8(1); - const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset); - for (int y = 0; y < 8; dst += stride, ++y) { - __m128i offset_y = _mm_srai_epi16(left_y, scale_bits); - offset_y = _mm_packs_epi16(offset_y, offset_y); - const __m128i adjacent = _mm_add_epi8(offset_y, index_increment); - - // Offset the relative index because ystep is negative in Zone 2 and shuffle - // indices must be nonnegative. 
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent); - sampler = _mm_add_epi8(sampler, denegation); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - - // The specification adds (y << 6) to left_y, which is subject to - // upsampling, but this puts sampler indices out of the 0-15 range. It is - // equivalent to offset the source address by (y << upsample_shift) instead. - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - left_column - kPositiveIndexOffset + (y << upsample_shift), shifts, - sampler); - StoreLo8(dst, _mm_packus_epi16(vals, vals)); - } -} - -// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 << -// upsampled_top), for each row. When there are 4 values, they can be duplicated -// with a non-register shuffle mask. -// |shifts| is one pair of weights that applies throughout a given row. 
-template <bool upsampled_top> -inline void DirectionalZone1Blend_4x4( - uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, - __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, - const __m128i& dest_index_x, int top_x, const int xstep) { - const int upsample_shift = static_cast<int>(upsampled_top); - const int scale_bits_x = 6 - upsample_shift; - top_x -= xstep; - - int top_base_x = (top_x >> scale_bits_x); - const __m128i vals0 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler); - DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals1 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler); - DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals2 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler); - DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds); - top_x -= xstep; - dest += stride; - - top_base_x = (top_x >> scale_bits_x); - const __m128i vals3 = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler); - DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds); -} - -template <bool upsampled_top, int height> -inline void DirectionalZone1Blend_8xH( - uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride, - __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts, - const __m128i& dest_index_x, int top_x, const int xstep) { - const int upsample_shift = static_cast<int>(upsampled_top); - const int scale_bits_x = 6 - upsample_shift; - - __m128i y_selector = _mm_set1_epi32(0x01000100); - const __m128i index_increment = _mm_set1_epi32(0x02020202); - for 
(int y = 0; y < height; ++y, - y_selector = _mm_add_epi8(y_selector, index_increment), - dest += stride) { - top_x -= xstep; - const int top_base_x = top_x >> scale_bits_x; - const __m128i vals = DirectionalZone2FromSource_SSE4_1( - top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler); - DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector); - } -} - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for this function is to know how many blocks can be processed -// with just pixels from |top_ptr|, then handle mixed blocks, then handle only -// blocks that take from |left_ptr|. Additionally, a fast index-shuffle -// approach is used for pred values from |left_column| in sections that permit -// it. -template <bool upsampled_left, bool upsampled_top> -inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int width, const int height, - const int xstep, const int ystep) { - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride8 = stride << 3; - const __m128i dest_index_x = - _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. This assumes minimum |xstep| is 3. - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from left_column may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. 
- const int max_shuffle_height = - std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); - - const int xstep8 = xstep << 3; - const __m128i xstep8_vect = _mm_set1_epi16(xstep8); - // Accumulate xstep across 8 rows. - const __m128i xstep_dup = _mm_set1_epi16(-xstep); - const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); - const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 - const __m128i scaled_one = _mm_set1_epi16(-64); - __m128i xstep_bounds_base = - (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift) - : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); - - const int left_base_increment = ystep >> 6; - const int ystep_remainder = ystep & 0x3F; - const int ystep8 = ystep << 3; - const int left_base_increment8 = ystep8 >> 6; - const int ystep_remainder8 = ystep8 & 0x3F; - const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8); - - // If the 64 scaling is regarded as a decimal point, the first value of the - // left_y vector omits the portion which is covered under the left_column - // offset. Following values need the full ystep as a relative offset. - const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); - const __m128i ystep_dup = _mm_set1_epi16(-ystep); - __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); - left_y = _mm_add_epi16(ystep_init, left_y); - - const __m128i increment_top8 = _mm_set1_epi16(8 << 6); - int x = 0; - - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. 
- for (int left_offset = -left_base_increment; x < min_top_only_x; - x += 8, - xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), - // Watch left_y because it can still get big. - left_y = _mm_add_epi16(left_y, increment_left8), - left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - DirectionalZone1_4xH(dst_x + 4, stride, - top_row + ((x + 4) << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Pick up from the last y-value, using the 10% slower but secure method for - // left prediction. 
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left_only rows. 
- for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - } - } - for (; x < width; x += 4) { - DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), - height, -xstep, upsampled_top); - } -} - -template <bool upsampled_left, bool upsampled_top> -inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int width, const int height, - const int xstep, const int ystep) { - auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); - const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride4 = stride << 2; - const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - const int xstep4 = xstep << 2; - const __m128i xstep4_vect = _mm_set1_epi16(xstep4); - const __m128i xstep_dup = _mm_set1_epi16(-xstep); - const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001); - __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments); - const __m128i scaled_one = _mm_set1_epi16(-64); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 - __m128i xstep_bounds_base = - (xstep == 64) ? 
_mm_sub_epi16(scaled_one, xstep_for_shift) - : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift); - - const int left_base_increment = ystep >> 6; - const int ystep_remainder = ystep & 0x3F; - const int ystep4 = ystep << 2; - const int left_base_increment4 = ystep4 >> 6; - // This is guaranteed to be less than 64, but accumulation may bring it past - // 64 for higher x values. - const int ystep_remainder4 = ystep4 & 0x3F; - const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4); - const __m128i increment_top4 = _mm_set1_epi16(4 << 6); - - // If the 64 scaling is regarded as a decimal point, the first value of the - // left_y vector omits the portion which will go into the left_column offset. - // Following values need the full ystep as a relative offset. - const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); - const __m128i ystep_dup = _mm_set1_epi16(-ystep); - __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); - left_y = _mm_add_epi16(ystep_init, left_y); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - - int x = 0; - // Loop over x for columns with a mixture of sources. - for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4, - xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4), - left_y = _mm_add_epi16(left_y, increment_left4), - left_offset -= left_base_increment4) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. Rounded up to the nearest multiple of 4. 
- const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height); - - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - // Loop over y for mixed rows. - for (; y < min_left_only_y; - y += 4, dst_x += stride4, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect), - top_x -= xstep4) { - DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) * (1 << upsample_left_shift)), - left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_4x4<upsampled_top>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left-only rows, if any. - for (; y < height; y += 4, dst_x += stride4) { - DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - } - } - // Loop over top-only columns, if any. 
- for (; x < width; x += 4) { - DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), - height, -xstep, upsampled_top); - } -} - -void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - const int width, const int height, - const int xstep, const int ystep, - const bool upsampled_top, - const bool upsampled_left) { - // Increasing the negative buffer for this function allows more rows to be - // processed at a time without branching in an inner loop to check the base. - uint8_t top_buffer[288]; - uint8_t left_buffer[288]; - memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160); - memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160); - const uint8_t* top_ptr = top_buffer + 144; - const uint8_t* left_ptr = left_buffer + 144; - if (width == 4 || height == 4) { - if (upsampled_left) { - if (upsampled_top) { - DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } else { - if (upsampled_top) { - DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } - return; - } - if (upsampled_left) { - if (upsampled_top) { - DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } - } else { - if (upsampled_top) { - DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } else { - DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr, - width, height, xstep, ystep); - } 
- } -} - -//------------------------------------------------------------------------------ -// FilterIntraPredictor_SSE4_1 - -// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th -// at zero to preserve the sum. -inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride, - const __m128i& pixels, const __m128i& taps_0_1, - const __m128i& taps_2_3, const __m128i& taps_4_5, - const __m128i& taps_6_7) { - const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1); - const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3); - // |output_half| contains 8 partial sums. - __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); - __m128i output = _mm_hadd_epi16(output_half, output_half); - const __m128i output_row0 = - _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), - /* arbitrary pack arg */ output); - Store4(dst, output_row0); - const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5); - const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7); - output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); - output = _mm_hadd_epi16(output_half, output_half); - const __m128i output_row1 = - _mm_packus_epi16(RightShiftWithRounding_S16(output, 4), - /* arbitrary pack arg */ output); - Store4(dst + stride, output_row1); -} - -// 4xH transform sizes are given special treatment because LoadLo8 goes out -// of bounds and every block involves the left column. 
This implementation -// loads TL from the top row for the first block, so it is not -inline void Filter4xH(uint8_t* dest, ptrdiff_t stride, - const uint8_t* const top_ptr, - const uint8_t* const left_ptr, FilterIntraPredictor pred, - const int height) { - const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]); - const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]); - const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]); - const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]); - __m128i top = Load4(top_ptr - 1); - __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4); - __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr)); - left = _mm_slli_si128(left, 5); - - // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], - // left[2], left[3], left[4], left[5], left[6], left[7] - pixels = _mm_or_si128(left, pixels); - - // Duplicate first 8 bytes. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 1. - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], - // left[0], left[1], ... - pixels = _mm_or_si128(left, pixels); - - // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last - // byte is an unused value, which shall be multiplied by 0 when we apply the - // filter. - constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; - - // Insert left[-1] in front as TL and put left[0] and left[1] at the end. - const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 2. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 3. - - // Compute the middle 8 rows before using common code for the final 4 rows. 
- // Because the common code below this block assumes that - if (height == 16) { - // This shift allows us to use pixel_order2 twice after shifting by 2 later. - left = _mm_slli_si128(left, 1); - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], - // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] - pixels = _mm_or_si128(left, pixels); - - // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The - // last byte is an unused value, as above. The top-left was shifted to - // position nine to keep two empty spaces after the top pixels. - constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; - - // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at - // the end. - const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - dest += stride; // Move to y = 4. - - // First 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // Clear all but final pixel in the first 8 of left column. - __m128i keep_top_left = _mm_srli_si128(left, 13); - dest += stride; // Move to y = 5. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-6], - // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] - pixels = _mm_or_si128(left, pixels); - left = LoadLo8(left_ptr + 8); - - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - dest += stride; // Move to y = 6. - - // Second 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // Position TL value so we can use pixel_order1. - keep_top_left = _mm_slli_si128(keep_top_left, 6); - dest += stride; // Move to y = 7. 
- pixels = Load4(dest); - left = _mm_slli_si128(left, 7); - left = _mm_or_si128(left, keep_top_left); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 8. - - // Third 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 9. - - // Prepare final inputs. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 10. - - // Fourth 4x2 in the if body. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 11. - } - - // In both the 8 and 16 case, we assume that the left vector has the next TL - // at position 8. - if (height > 4) { - // Erase prior left pixels by shifting TL to position 0. - left = _mm_srli_si128(left, 8); - left = _mm_slli_si128(left, 6); - pixels = Load4(dest); - - // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, - // left[-1], left[0], left[1], left[2], left[3], ... - pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 12 or 4. - - // First of final two 4x2 blocks. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dest += stride; // Move to y = 13 or 5. - pixels = Load4(dest); - left = _mm_srli_si128(left, 2); - - // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] - // left[-1], left[0], left[1], left[2], left[3], ... 
- pixels = _mm_or_si128(left, pixels); - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - dest += stride; // Move to y = 14 or 6. - - // Last of final two 4x2 blocks. - Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - } -} - -void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride, - const void* const top_row, - const void* const left_column, - FilterIntraPredictor pred, const int width, - const int height) { - const auto* const top_ptr = static_cast<const uint8_t*>(top_row); - const auto* const left_ptr = static_cast<const uint8_t*>(left_column); - auto* dst = static_cast<uint8_t*>(dest); - if (width == 4) { - Filter4xH(dst, stride, top_ptr, left_ptr, pred, height); - return; - } - - // There is one set of 7 taps for each of the 4x2 output pixels. - const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]); - const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]); - const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]); - const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]); - - // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at - // the end is an unused value, which shall be multiplied by 0 when we apply - // the filter. - constexpr int64_t kCondenseLeftMask = 0x0F09080403020100; - - // Takes the "left section" and puts it right after p0-p4. - const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); - - // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last - // byte is unused as above. - constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008; - - // Shuffles the "top left" from the left section, to the front. Used when - // grabbing data from left_column and not top_row. - const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); - - // This first pass takes care of the cases where the top left pixel comes from - // top_row. 
- __m128i pixels = LoadLo8(top_ptr - 1); - __m128i left = _mm_slli_si128(Load4(left_column), 8); - pixels = _mm_or_si128(pixels, left); - - // Two sets of the same pixels to multiply with two sets of taps. - pixels = _mm_shuffle_epi8(pixels, pixel_order1); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7); - left = _mm_srli_si128(left, 1); - - // Load - pixels = Load4(dst + stride); - - // Because of the above shift, this OR 'invades' the final of the first 8 - // bytes of |pixels|. This is acceptable because the 8th filter tap is always - // a padded 0. - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - const ptrdiff_t stride2 = stride << 1; - const ptrdiff_t stride4 = stride << 2; - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - dst += 4; - for (int x = 3; x < width - 4; x += 4) { - pixels = Load4(top_ptr + x); - pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4); - pixels = _mm_insert_epi8(pixels, dst[-1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - pixels = Load4(dst + stride - 1); - pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - dst += 4; - } - - // Now we handle heights that reference previous blocks rather than top_row. - for (int y = 4; y < height; y += 4) { - // Leftmost 4x4 block for this height. - dst -= width; - dst += stride4; - - // Top Left is not available by offset in these leftmost blocks. 
- pixels = Load4(dst - stride); - left = _mm_slli_si128(Load4(left_ptr + y - 1), 8); - left = _mm_insert_epi8(left, left_ptr[y + 3], 12); - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - - // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. - left = _mm_srli_si128(left, 2); - pixels = Load4(dst + stride); - pixels = _mm_or_si128(pixels, left); - pixels = _mm_shuffle_epi8(pixels, pixel_order2); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - - dst += 4; - - // Remaining 4x4 blocks for this height. - for (int x = 4; x < width; x += 4) { - pixels = Load4(dst - stride - 1); - pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[-1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, - taps_6_7); - pixels = Load4(dst + stride - 1); - pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4); - pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5); - pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6); - - // Duplicate bottom half into upper half. - pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf); - Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, - taps_4_5, taps_6_7); - dst += 4; - } - } -} - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); @@ -2746,21 +1412,6 @@ void Init8bpp() { // These guards check if this version of the function was not superseded by // a higher optimization level, such as AVX. The corresponding #define also // prevents the C version from being added to the table. 
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) - dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1) - dsp->directional_intra_predictor_zone1 = - DirectionalIntraPredictorZone1_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2) - dsp->directional_intra_predictor_zone2 = - DirectionalIntraPredictorZone2_SSE4_1; -#endif -#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3) - dsp->directional_intra_predictor_zone3 = - DirectionalIntraPredictorZone3_SSE4_1; -#endif #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop) dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = DcDefs::_4x4::DcTop; @@ -3524,7 +2175,7 @@ void IntraPredInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h index 7f4fcd7..1f6f30a 100644 --- a/src/dsp/x86/intrapred_sse4.h +++ b/src/dsp/x86/intrapred_sse4.h @@ -23,13 +23,9 @@ namespace libgav1 { namespace dsp { -// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, -// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and -// Dsp::filter_intra_predictor, see the defines below for specifics. These -// functions are not thread-safe. +// Initializes Dsp::intra_predictors. See the defines below for specifics. +// These functions are not thread-safe. void IntraPredInit_SSE4_1(); -void IntraPredCflInit_SSE4_1(); -void IntraPredSmoothInit_SSE4_1(); } // namespace dsp } // namespace libgav1 @@ -37,22 +33,6 @@ void IntraPredSmoothInit_SSE4_1(); // If sse4 is enabled and the baseline isn't set due to a higher level of // optimization being enabled, signal the sse4 implementation should be used. 
#if LIBGAV1_TARGETING_SSE4_1 -#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor -#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 -#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1 -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 #endif @@ -138,174 +118,6 @@ void IntraPredSmoothInit_SSE4_1(); LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 -#define 
LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor -#define 
LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor -#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 -#endif - #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 #endif @@ -658,287 +470,6 @@ void IntraPredSmoothInit_SSE4_1(); LIBGAV1_CPU_SSE4_1 #endif -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 
-#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical -#define 
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal -#define 
LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal -#define 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - -#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal -#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ - LIBGAV1_CPU_SSE4_1 -#endif - //------------------------------------------------------------------------------ // 10bpp diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc index 787d706..12c008f 100644 --- a/src/dsp/x86/inverse_transform_sse4.cc +++ b/src/dsp/x86/inverse_transform_sse4.cc @@ -94,8 +94,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b, static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); const __m128i ba = _mm_unpacklo_epi16(*a, *b); const __m128i ab = _mm_unpacklo_epi16(*b, *a); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 
0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i x0 = _mm_madd_epi16(ba, msin_pcos); @@ -121,8 +120,7 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b, const int16_t sin128 = Sin128(angle); const __m128i psin_pcos = _mm_set1_epi32( static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); - const __m128i sign = - _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001)); // -sin cos, -sin cos, -sin cos, -sin cos const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); const __m128i ba = _mm_unpacklo_epi16(*a, *b); @@ -229,7 +227,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height, const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); const __m128i v_src = (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1039,7 +1038,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src = _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1194,7 +1194,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, __m128i s[8]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1519,7 +1520,8 @@ LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, __m128i x[16]; const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1615,7 +1617,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1767,7 +1770,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round = @@ -1859,7 +1863,8 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, auto* dst = static_cast<int16_t*>(dest); const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); - const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0); const __m128i v_kTransformRowMultiplier = _mm_set1_epi16(kTransformRowMultiplier << 3); const __m128i v_src_round0 = @@ -2918,75 +2923,11 @@ void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type, //------------------------------------------------------------------------------ -template <typename Residual, typename Pixel> -void InitAll(Dsp* const dsp) { - // Maximum transform size for Dct is 64. - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = - Dct4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = - Dct4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = - Dct8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = - Dct8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = - Dct16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = - Dct16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = - Dct32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = - Dct32TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = - Dct64TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = - Dct64TransformLoopColumn_SSE4_1; - - // Maximum transform size for Adst is 
16. - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = - Adst4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = - Adst4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = - Adst8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = - Adst8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = - Adst16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = - Adst16TransformLoopColumn_SSE4_1; - - // Maximum transform size for Identity transform is 32. - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = - Identity4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = - Identity4TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = - Identity8TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = - Identity8TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = - Identity16TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = - Identity16TransformLoopColumn_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = - Identity32TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = - Identity32TransformLoopColumn_SSE4_1; - - // Maximum transform size for Wht is 4. 
- dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = - Wht4TransformLoopRow_SSE4_1; - dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = - Wht4TransformLoopColumn_SSE4_1; -} - void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS - InitAll<int16_t, uint8_t>(dsp); -#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // Maximum transform size for Dct is 64. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct) dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = Dct4TransformLoopRow_SSE4_1; @@ -3017,6 +2958,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = Dct64TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Adst is 16. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst) dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = Adst4TransformLoopRow_SSE4_1; @@ -3035,6 +2978,8 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = Adst16TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Identity transform is 32. #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity) dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = Identity4TransformLoopRow_SSE4_1; @@ -3059,13 +3004,14 @@ void Init8bpp() { dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = Identity32TransformLoopColumn_SSE4_1; #endif + + // Maximum transform size for Wht is 4. 
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht) dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = Wht4TransformLoopRow_SSE4_1; dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = Wht4TransformLoopColumn_SSE4_1; #endif -#endif } } // namespace @@ -3075,7 +3021,7 @@ void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc index d67b450..b9da2d5 100644 --- a/src/dsp/x86/loop_filter_sse4.cc +++ b/src/dsp/x86/loop_filter_sse4.cc @@ -350,7 +350,7 @@ void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -454,7 +454,7 @@ void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -595,7 +595,7 @@ void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -697,7 +697,7 @@ void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 
0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -838,7 +838,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i p6 = Load4(dst - 7 * stride); const __m128i p5 = Load4(dst - 6 * stride); const __m128i p4 = Load4(dst - 5 * stride); @@ -864,8 +864,7 @@ void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -1050,7 +1049,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, const __m128i v_mask = _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i v_isflatouter4_mask = IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); const __m128i v_flat4_mask = @@ -1066,8 +1065,7 @@ void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -1458,7 +1456,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal6(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, 
v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -1572,7 +1570,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical6(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp1_f6; __m128i oqp0_f6; @@ -1711,7 +1709,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -1821,7 +1819,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { __m128i oqp2_f8; __m128i oqp1_f8; __m128i oqp0_f8; @@ -1957,7 +1955,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i p6 = LoadLo8(dst - 7 * stride); const __m128i p5 = LoadLo8(dst - 6 * stride); const __m128i p4 = LoadLo8(dst - 5 * stride); @@ -1984,8 +1982,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal14(void* dest, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if 
(_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -2133,7 +2130,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8, const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); - if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { const __m128i v_isflatouter4_mask = IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); @@ -2150,8 +2147,7 @@ void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical14(void* dest, ptrdiff_t stride8, oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); - if (_mm_test_all_zeros(v_flat4_mask, - _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { __m128i oqp5_f14; __m128i oqp4_f14; __m128i oqp3_f14; @@ -2245,7 +2241,7 @@ void LoopFilterInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc index 702bdea..b38f322 100644 --- a/src/dsp/x86/loop_restoration_10bit_avx2.cc +++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc @@ -28,7 +28,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -472,12 +471,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* 
const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -502,39 +501,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, &coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, &coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if 
(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, &coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, &coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, 
+ height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -566,12 +568,2575 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, } } +//------------------------------------------------------------------------------ +// SGR + +constexpr int kSumOffset = 24; + +// SIMD overreads the number of pixels in SIMD registers - (width % 8) - 2 * +// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The number of +// bytes in SIMD registers is 16 for SSE4.1 and 32 for AVX2. +constexpr int kOverreadInBytesPass1_128 = 4; +constexpr int kOverreadInBytesPass2_128 = 8; +constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16; +constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m256i dst[2]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); +} + +inline void LoadAligned32x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m256i dst[3]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); + dst[2] = LoadAligned32(src[2] + x); +} + +inline void LoadAligned32x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3]) { + dst[0] = 
LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); + dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m256i dst[2][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m256i dst[3][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); + LoadAligned64(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); + LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); 
+} + +// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17], +// ..., ma[23]. +// There is an 8 pixel gap between the first half and the second half. +constexpr int kMaStoreOffset = 8; + +inline void StoreAligned32_ma(uint16_t* src, const __m256i v) { + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1)); +} + +inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) { + // The next 4 lines are much faster than: + // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20)); + // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31)); + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0)); + StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1)); + StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1)); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
+ +inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VmullNLo8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m256i VmullNHi8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return 
_mm_madd_epi16(s0, s1); +} + +inline __m256i VmullHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VrshrU16(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1))); + return _mm_srli_epi16(sum, src1); +} + +inline __m256i VrshrU16(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1))); + return _mm256_srli_epi16(sum, src1); +} + +inline __m256i VrshrS32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m256i VrshrU32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srli_epi32(sum, src1); +} + +inline void Square(const __m128i src, __m128i dst[2]) { + const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128()); + dst[0] = _mm_madd_epi16(s0, s0); + dst[1] = _mm_madd_epi16(s1, s1); +} + +inline void Square(const __m256i src, __m256i dst[2]) { + const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256()); + dst[0] = _mm256_madd_epi16(s0, s0); + dst[1] = _mm256_madd_epi16(s1, s1); +} + +inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) { + dst[0] = _mm256_alignr_epi8(src[1], src[0], 0); + dst[1] = _mm256_alignr_epi8(src[1], src[0], 1); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 2); +} + +inline 
void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm256_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm256_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m256i Sum3_16(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi16(src0, src1); + return _mm256_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m256i Sum3_16(const __m256i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m256i Sum3_32(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi32(src0, src1); + return _mm256_add_epi32(sum, src2); +} + +inline __m128i 
Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline __m256i Sum3_32(const __m256i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m256i Sum3WLo16(const __m256i src[3]) { + const __m256i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WHi16(const __m256i src[3]) { + const __m256i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m256i Sum5_16(const __m256i src[5]) { + const __m256i sum01 = _mm256_add_epi16(src[0], src[1]); + const __m256i sum23 = _mm256_add_epi16(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return _mm256_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1, + const __m256i* const src2, const __m256i* const src3, + const __m256i* const src4) { + const __m256i sum01 = _mm256_add_epi32(*src0, *src1); + const __m256i sum23 = _mm256_add_epi32(*src2, *src3); + const __m256i sum = _mm256_add_epi32(sum01, 
sum23); + return _mm256_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline __m256i Sum5_32(const __m256i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline __m256i Sum3Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[3]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + return Sum3_16(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline __m256i Sum5Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + return Sum5_16(s); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3, __m256i* const row5) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] 
= LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + const __m256i sum04 = _mm256_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm256_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3_0, __m256i* const row3_1, + __m256i* const row5_0, __m256i* const row5_1) { + SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0); + SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1); +} + +inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = _mm256_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0, + __m256i* const row_sq3_1, __m256i* const row_sq5_0, + __m256i* const row_sq5_1) { + __m256i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, 
s); + dst[1] = Sum3_32(s); +} + +inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline __m256i Sum343Lo(const __m256i ma3[3]) { + const __m256i sum = Sum3WLo16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m256i Sum343Hi(const __m256i ma3[3]) { + const __m256i sum = Sum3WHi16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m256i Sum343(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return _mm256_add_epi32(sum3, src[1]); +} + +inline void Sum343(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum343(s); + Prepare3_32(src + 1, s); + dst[1] = Sum343(s); +} + +inline __m256i Sum565Lo(const __m256i src[3]) { + const __m256i sum = Sum3WLo16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m256i Sum565Hi(const __m256i src[3]) { + const __m256i sum = Sum3WHi16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwHi8(sum5, 
src[1]); +} + +inline __m256i Sum565(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return _mm256_add_epi32(sum5, src[1]); +} + +inline void Sum565(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum565(s); + Prepare3_32(src + 1, s); + dst[1] = Sum565(s); +} + +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass1_128 - sizeof(*src) * width; + const ptrdiff_t overread_in_bytes_256 = + kOverreadInBytesPass1_256 - sizeof(*src) * width; + int y = 2; + do { + __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2]; + __m256i sq[8]; + s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0); + s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s0[0], sq_128 + 0); + Square(s0[1], sq_128 + 2); + SumHorizontal16(s0, &s3, &s5); + StoreAligned16(sum3, s3); + StoreAligned16(sum5, s5); + SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]); + StoreAligned32U32(square_sum3, sq3); + StoreAligned32U32(square_sum5, sq5); + src += 8; + sum3 += 8; + sum5 += 8; + square_sum3 += 8; + square_sum5 += 8; + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ptrdiff_t x = sum_width; + do { + __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2]; + s[0] = LoadUnaligned32Msan( + src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + s[1] = LoadUnaligned32Msan( + src + 24, + overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24)); + Square(s[0], sq + 2); + Square(s[1], sq + 6); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + sq[4] = 
_mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + SumHorizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8), + &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned64(sum3, row3); + StoreAligned64(sum5, row5); + SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned64(square_sum3 + 0, row_sq3); + StoreAligned64(square_sum5 + 0, row_sq5); + SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned64(square_sum3 + 16, row_sq3); + StoreAligned64(square_sum5 + 16, row_sq5); + sq[0] = sq[6]; + sq[1] = sq[7]; + src += 32; + sum3 += 32; + sum5 += 32; + square_sum3 += 32; + square_sum5 += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sum3 += sum_stride - sum_width - 8; + sum5 += sum_stride - sum_width - 8; + square_sum3 += sum_stride - sum_width - 8; + square_sum5 += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + int overread_in_bytes_128, overread_in_bytes_256; + if (size == 3) { + overread_in_bytes_128 = kOverreadInBytesPass2_128; + overread_in_bytes_256 = kOverreadInBytesPass2_256; + } else { + overread_in_bytes_128 = kOverreadInBytesPass1_128; + overread_in_bytes_256 = kOverreadInBytesPass1_256; + } + overread_in_bytes_128 -= sizeof(*src) * width; + overread_in_bytes_256 -= sizeof(*src) * width; + int y = 2; + do { + __m128i s_128[2], ss, sq_128[4], sqs[2]; + __m256i sq[8]; + s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128); + s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s_128[0], sq_128 + 0); + Square(s_128[1], sq_128 + 2); + if (size == 3) { + ss = Sum3Horizontal16(s_128); 
+ Sum3Horizontal32(sq_128, sqs); + } else { + ss = Sum5Horizontal16(s_128); + Sum5Horizontal32(sq_128, sqs); + } + StoreAligned16(sums, ss); + StoreAligned32U32(square_sums, sqs); + src += 8; + sums += 8; + square_sums += 8; + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ptrdiff_t x = sum_width; + do { + __m256i s[2], row[2], row_sq[4]; + s[0] = LoadUnaligned32Msan( + src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + s[1] = LoadUnaligned32Msan( + src + 24, + overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24)); + Square(s[0], sq + 2); + Square(s[1], sq + 6); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + if (size == 3) { + row[0] = Sum3Horizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + row[1] = + Sum3Horizontal16(src + 16, overread_in_bytes_256 + + sizeof(*src) * (sum_width - x + 24)); + Sum3Horizontal32(sq + 0, row_sq + 0); + Sum3Horizontal32(sq + 4, row_sq + 2); + } else { + row[0] = Sum5Horizontal16( + src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8)); + row[1] = + Sum5Horizontal16(src + 16, overread_in_bytes_256 + + sizeof(*src) * (sum_width - x + 24)); + Sum5Horizontal32(sq + 0, row_sq + 0); + Sum5Horizontal32(sq + 4, row_sq + 2); + } + StoreAligned64(sums, row); + StoreAligned64(square_sums + 0, row_sq + 0); + StoreAligned64(square_sums + 16, row_sq + 2); + sq[0] = sq[6]; + sq[1] = sq[7]; + src += 32; + sums += 32; + square_sums += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sums += sum_stride - sum_width - 8; + square_sums += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + 
// a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n)); + __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4)); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m128i b = VrshrU16(sum, 2); + const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm_packus_epi32(z0, z1); +} + +template <int n> +inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m256i dxd = _mm256_madd_epi16(sum, sum); + // _mm256_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. 
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n)); + __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4)); + const __m256i sub = _mm256_sub_epi32(axn, dxd); + const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256()); + const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m256i b = VrshrU16(sum, 2); + const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256()); + const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256()); + const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm256_packus_epi32(z0, z1); +} + +inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. 
+ const __m256i m = + _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter)); + const __m256i m0 = VmullLo16(m, sum); + const __m256i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m128i m0 = VmullLo16(ma, sum); + const __m128i m1 = VmullHi16(ma, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m256i m0 = VmullLo16(ma, sum); + const __m256i m1 = VmullHi16(ma, sum); + const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n)); + const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2], + const uint32_t scale, __m256i* const sum, + __m256i* const index) { + __m256i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + 
__m128i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2], + const uint32_t scale, __m256i* const sum, + __m256i* const index) { + __m256i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +template <int n> +inline void LookupIntermediate(const __m128i sum, const __m128i index, + __m128i* const ma, __m128i b[2]) { + static_assert(n == 9 || n == 25, ""); + const __m128i idx = _mm_packus_epi16(index, index); + // Actually it's not stored and loaded. The compiler will use a 64-bit + // general-purpose register to process. Faster than using _mm_extract_epi8(). + uint8_t temp[8]; + StoreLo8(temp, idx); + *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + if (n == 9) { + CalculateB3(sum, maq, b); + } else { + CalculateB5(sum, maq, b); + } +} + +// Repeat the first 48 elements in kSgrMaLookup with a period of 16. +alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = { + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, + 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, + 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5}; + +// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b +// to get value 0 as the shuffle result. The most significiant bit 1 comes +// either from the comparison instruction, or from the sign bit of the index. +inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { + __m128i mask; + mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); + mask = _mm_or_si128(mask, index); + return _mm_shuffle_epi8(table, mask); +} + +inline __m256i ShuffleIndex(const __m256i table, const __m256i index) { + __m256i mask; + mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15)); + mask = _mm256_or_si256(mask, index); + return _mm256_shuffle_epi8(table, mask); +} + +inline __m128i AdjustValue(const __m128i value, const __m128i index, + const int threshold) { + const __m128i thresholds = _mm_set1_epi8(threshold - 128); + const __m128i offset = _mm_cmpgt_epi8(index, thresholds); + return _mm_add_epi8(value, offset); +} + +inline __m256i AdjustValue(const __m256i value, const __m256i index, + const int threshold) { + const __m256i thresholds = _mm256_set1_epi8(threshold - 128); + const __m256i offset = _mm256_cmpgt_epi8(index, thresholds); + return _mm256_add_epi8(value, offset); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i* const ma, __m128i b0[2], + __m128i b1[2]) { + 
// Use table lookup to read elements whose indices are less than 48. + const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); + const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); + const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); + const __m128i indices = _mm_packus_epi16(index[0], index[1]); + __m128i idx; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + *ma = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + *ma = _mm_or_si128(*ma, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res2 = ShuffleIndex(c2, idx); + *ma = _mm_or_si128(*ma, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. + idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); + *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. + *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. + *ma = AdjustValue(*ma, idx, 101); // 101 is the last index which value is 3. + *ma = AdjustValue(*ma, idx, 169); // 169 is the last index which value is 2. + *ma = AdjustValue(*ma, idx, 254); // 254 is the last index which value is 1. 
+ + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[0], maq0, b0); + const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[1], maq1, b1); +} + +template <int n> +inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], + __m256i ma[3], __m256i b0[2], __m256i b1[2]) { + static_assert(n == 9 || n == 25, ""); + // Use table lookup to read elements whose indices are less than 48. + const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32); + const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32); + const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32); + const __m256i indices = _mm256_packus_epi16(index[0], index[1]); // 0 2 1 3 + __m256i idx, mas; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + mas = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16)); + const __m256i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + mas = _mm256_or_si256(mas, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. 
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16)); + const __m256i res2 = ShuffleIndex(c2, idx); + mas = _mm256_or_si256(mas, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. + idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5)); + mas = AdjustValue(mas, idx, 55); // 55 is the last index which value is 5. + mas = AdjustValue(mas, idx, 72); // 72 is the last index which value is 4. + mas = AdjustValue(mas, idx, 101); // 101 is the last index which value is 3. + mas = AdjustValue(mas, idx, 169); // 169 is the last index which value is 2. + mas = AdjustValue(mas, idx, 254); // 254 is the last index which value is 1. + + ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31 + ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31 + ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21); + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256()); + const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256()); + __m256i sums[2]; + sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20); + sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31); + if (n == 9) { + CalculateB3(sums[0], maq0, b0); + CalculateB3(sums[1], maq1, b1); + } else { + CalculateB5(sums[0], maq0, b0); + CalculateB5(sums[1], maq1, b1); + } +} + +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9>(sum, index, ma, b); +} + +inline void Store343_444(const __m256i b3[3], const ptrdiff_t x, + __m256i sum_b343[2], __m256i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m256i b[3], sum_b111[2]; + Prepare3_32(b3 + 0, b); + sum_b111[0] = Sum3_32(b); + sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2); + sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]); + Prepare3_32(b3 + 1, b); + sum_b111[1] = Sum3_32(b); + sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2); + sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]); + StoreAligned64(b444 + x, sum_b444); + StoreAligned64(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + 
const __m256i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m256i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, 
&sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +// Don't combine the following 2 functions, which would be slower. +inline void Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, + __m256i* const sum_ma444_lo, + __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], __m256i sum_b444_lo[2], + __m256i sum_b444_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_mat343[2], sum_mat444[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20); + *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31); + StoreAligned32(ma444 + x + 0, *sum_ma444_lo); + StoreAligned32(ma444 + x + 16, *sum_ma444_hi); + const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444); +} + +inline void 
Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444[2], sum_b444[2], sum_mat343[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + StoreAligned64_ma(ma444 + x, sum_ma444); + const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444); +} + +inline void PermuteB(const __m256i t[4], __m256i b[7]) { + // Input: + // 0 1 2 3 // b[0] + // 4 5 6 7 // b[1] + // 8 9 10 11 24 25 26 27 // t[0] + // 12 13 14 15 28 29 30 31 // t[1] + // 16 17 18 19 32 33 34 35 // t[2] + // 20 21 22 23 36 37 38 39 // t[3] + + // Output: + // 0 1 2 3 8 9 10 11 // b[0] + // 4 5 6 7 12 13 14 15 // b[1] + // 8 9 10 11 16 17 18 19 // b[2] + // 16 17 18 19 24 25 26 27 // b[3] + // 20 21 22 23 28 29 30 31 // b[4] + // 24 25 26 27 32 33 34 35 // b[5] + // 20 21 22 23 36 37 38 39 // b[6] + b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21); + b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21); + b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20); + b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30); + b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30); 
+ b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31); + b[6] = t[3]; +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width, + const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3], + __m256i b[3]) { + __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32); + s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0); + s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 
16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const 
ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + __m256i sq[3], __m256i ma[3], __m256i b[3]) { + const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + Square(s0, sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s1, sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const uint16_t* const src, 
const ptrdiff_t over_read_in_bytes, + const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8], + __m256i ma[3], __m256i b[7]) { + __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s[0], sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s3[2] = Sum3Horizontal16(src, over_read_in_bytes); + s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32); + StoreAligned64(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 0, sq3[2]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + LoadAligned32x2U16(sum3, x, s3); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[1], sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum3Horizontal32(sq + 4, sq3[2]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate<9>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3], + __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + 
StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t x, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3], + __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) { + __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], + index_3[2][2], sum_5[2], index_5[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], 
&s3[1][3], &s5[0][4], + &s5[1][4]); + StoreAligned32(sum3[2] + x + 0, s3[0][2]); + StoreAligned32(sum3[2] + x + 16, s3[1][2]); + StoreAligned32(sum3[3] + x + 0, s3[0][3]); + StoreAligned32(sum3[3] + x + 16, s3[1][3]); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + StoreAligned64(square_sum3[3] + x, sq3[3]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0], + &index_3[1][0]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, 
s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], + &index_3[1][1]); + CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2); + PermuteB(t, b3[0]); + CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2); + PermuteB(t, b3[1]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2); + PermuteB(t, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], + __m256i b5[5]) { + const __m256i s0 = 
LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2], t[4]; + Square(s0, sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]); + LoadAligned32x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned64x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s1, sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]); + CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2); + PermuteB(t, b3); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2); + PermuteB(t, b5); +} + +inline void BoxSumFilterPreProcess5(const uint16_t* const src0, + const uint16_t* const src1, const int width, + const uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + 
uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][2], ma0, sq_128[2][4], b0[2]; + __m256i mas[3], sq[2][8], bs[10]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[2], b[4]; + BoxFilterPreProcess5( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned64_ma(ma565, ma); + Sum565(bs + 0, b + 0); + Sum565(bs + 3, b + 2); + StoreAligned64(b565, b + 0); + StoreAligned64(b565 + 16, b + 2); + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +template <bool calculate444> +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint16_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src) * width; + __m128i s[2], ma0, sq_128[4], 
b0[2]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s[0], sq_128); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma3[3]; + BoxFilterPreProcess3( + src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width), + x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444); + ma444 += 32; + b444 += 32; + } else { + __m256i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned64_ma(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 3, b + 2); + StoreAligned64(b343 + 0, b + 0); + StoreAligned64(b343 + 16, b + 2); + } + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + ma343 += 32; + b343 += 32; + x += 32; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10], + b5_128[10]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 
0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_128[0], b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[2], b[4], ma3x[3], ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + ma5, b5); + Prepare3_8(ma3[0], ma3x); + ma[0] = Sum343Lo(ma3x); + ma[1] = Sum343Hi(ma3x); + StoreAligned64_ma(ma343[0] + x, ma); + Sum343(b3[0], b); + Sum343(b3[0] + 3, b + 2); + StoreAligned64(b343[0] + x, b); + StoreAligned64(b343[0] + x + 16, b + 2); + Prepare3_8(ma3[1], ma3x); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444, + b343[1], b444); + Prepare3_8(ma5, ma5x); + ma[0] = Sum565Lo(ma5x); + ma[1] = Sum565Hi(ma5x); + StoreAligned64_ma(ma565, ma); + Sum565(b5, b); + StoreAligned64(b565, b); + Sum565(b5 + 3, b); + StoreAligned64(b565 + 16, b); + sq[0][0] = sq[0][6]; + 
sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][5]; + b3[0][1] = b3[0][6]; + b3[1][0] = b3[1][5]; + b3[1][1] = b3[1][6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +template <int shift> +inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) { + // ma: 255 * 32 = 8160 (13 bits) + // b: 65088 * 32 = 2082816 (21 bits) + // v: b - ma * 255 (22 bits) + const __m256i v = _mm256_sub_epi32(b, ma_x_src); + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 (13 bits) + return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <int shift> +inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma, + const __m256i b[2]) { + const __m256i ma_x_src_lo = VmullLo16(ma, src); + const __m256i ma_x_src_hi = VmullHi16(ma, src); + const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]); + const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]); + return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits +} + +inline __m256i CalculateFilteredOutputPass1(const __m256i src, + const __m256i ma[2], + const __m256i b[2][2]) { + const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]); + __m256i b_sum[2]; + b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]); + b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m256i CalculateFilteredOutputPass2(const __m256i src, + const __m256i ma[3], + const __m256i b[3][2]) { + const __m256i ma_sum = Sum3_16(ma); + __m256i b_sum[2]; + Sum3_32(b, b_sum); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) { + const __m256i v_lo = + VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m256i v_hi = + VrshrS32(v[1], kSgrProjRestoreBits + 
kSgrProjPrecisionBits); + const __m256i vv = _mm256_packs_epi32(v_lo, v_hi); + return _mm256_add_epi16(src, vv); +} + +inline __m256i SelfGuidedDoubleMultiplier(const __m256i src, + const __m256i filter[2], const int w0, + const int w2) { + __m256i v[2]; + const __m256i w0_w2 = + _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0)); + const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]); + const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]); + v[0] = _mm256_madd_epi16(w0_w2, f_lo); + v[1] = _mm256_madd_epi16(w0_w2, f_hi); + return SelfGuidedFinal(src, v); +} + +inline __m256i SelfGuidedSingleMultiplier(const __m256i src, + const __m256i filter, const int w0) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + __m256i v[2]; + v[0] = VmullNLo8(filter, w0); + v[1] = VmullNHi8(filter, w0); + return SelfGuidedFinal(src, v); +} + +inline void ClipAndStore(uint16_t* const dst, const __m256i val) { + const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256()); + const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023)); + StoreUnaligned32(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][2], ma0, sq_128[2][4], b0[2]; + __m256i mas[3], sq[2][8], bs[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + 
BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[1]); + StoreAligned32(ma565[1] + x + 16, ma[3]); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + StoreAligned64(b565[1] + x, b[1]); + StoreAligned64(b565[1] + x + 16, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[2] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0); + const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0); + ClipAndStore(dst + stride + x + 0, d10); + const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16); + const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], 
b[3]); + const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0); + ClipAndStore(dst + stride + x + 16, d11); + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma0[2], sq_128[8], b0[6]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0], + b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0[0], ma0[0]); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5LastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + LoadAligned64(b565 + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma565 + x + 16); + ma[1] = 
_mm256_permute2x128_si256(ma[2], ma[3], 0x31); + LoadAligned64(b565 + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src0) * width; + __m128i s0[2], ma0, sq_128[4], b0[2]; + __m256i mas[3], sq[8], bs[7]; + s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0); + s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16); + Square(s0[0], sq_128); + BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma[4], b[4][2], ma3[3]; + BoxFilterPreProcess3( + src0 + x + 8, + kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8, + sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1], + b343[2], b444[1]); + const __m256i sr_lo = LoadUnaligned32(src + x + 0); + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma343[0] + x); + ma[1] = LoadAligned32(ma444[0] + x); + LoadAligned64(b343[0] + x, b[0]); + LoadAligned64(b444[0] + x, b[1]); + const __m256i p0 = 
CalculateFilteredOutputPass2(sr_lo, ma, b); + ma[1] = LoadAligned32(ma343[0] + x + 16); + ma[2] = LoadAligned32(ma444[0] + x + 16); + LoadAligned64(b343[0] + x + 16, b[1]); + LoadAligned64(b444[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1); + const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_0, b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + 
sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3], + ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + ma5, b5); + Prepare3_8(ma3[0], ma3x[0]); + Prepare3_8(ma3[1], ma3x[1]); + Prepare3_8(ma5, ma5x); + Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1], + &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2], + ma444[1], b343[2], b444[1]); + Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2], + ma343[3], ma444[2], b343[3], b444[2]); + + ma[0][2] = Sum565Lo(ma5x); + ma[0][3] = Sum565Hi(ma5x); + ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20); + ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[0][1]); + StoreAligned32(ma565[1] + x + 16, ma[0][3]); + Sum565(b5, b[0][1]); + StoreAligned64(b565[1] + x, b[0][1]); + const __m256i sr0_lo = LoadUnaligned32(src + x); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x); + ma[0][0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned32(ma343[0] + x); + ma[1][1] = LoadAligned32(ma444[0] + x); + // Keeping the following 4 redundant lines is faster. 
The reason is that + // there are not enough registers available, and these values could be saved + // and loaded which is even slower. + ma[1][2] = LoadAligned32(ma343[2] + x); // Redundant line 1. + LoadAligned64(b343[0] + x, b[1][0]); + LoadAligned64(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + ma[2][0] = LoadAligned32(ma343[1] + x); + ma[2][1] = LoadAligned32(ma444[1] + x); // Redundant line 2. + LoadAligned64(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ClipAndStore(dst + x, d00); + const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + ClipAndStore(dst + stride + x, d10x); + + Sum565(b5 + 3, bt[0][1]); + StoreAligned64(b565[1] + x + 16, bt[0][1]); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16); + ma[0][2] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, bt[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]); + mat[1][0] = LoadAligned32(ma343[0] + x + 16); + mat[1][1] = LoadAligned32(ma444[0] + x + 16); + mat[1][2] = LoadAligned32(ma343[2] + x + 16); // Redundant line 3. + LoadAligned64(b343[0] + x + 16, bt[1][0]); + LoadAligned64(b444[0] + x + 16, bt[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]); + mat[2][0] = LoadAligned32(ma343[1] + x + 16); + mat[2][1] = LoadAligned32(ma444[1] + x + 16); // Redundant line 4. 
+ LoadAligned64(b343[1] + x + 16, bt[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]); + const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 16, d01); + const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 16, d11); + + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][5]; + b3[0][1] = b3[0][6]; + b3[1][0] = b3[1][5]; + b3[1][1] = b3[1][6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2]; + __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq_128, &ma3_0, &ma5_0, b3_128, b5_128); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ma3[0] = SetrM128i(ma3_0, ma3_0); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0] = SetrM128i(b3_128[0], b3_128[0]); + b3[1] = SetrM128i(b3_128[1], b3_128[1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[4], mat[4], b[3][2], 
bt[3][2], ma3x[3], ma5x[3], p[2]; + BoxFilterPreProcessLastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3, + b5); + Prepare3_8(ma3, ma3x); + Prepare3_8(ma5, ma5x); + ma[2] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + mat[1] = Sum565Hi(ma5x); + Sum565(b5 + 3, bt[1]); + ma[3] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + mat[2] = Sum343Hi(ma3x); + Sum343(b3 + 3, bt[2]); + + const __m256i sr_lo = LoadUnaligned32(src + x); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20); + mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31); + LoadAligned64(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, bt[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31); + LoadAligned64(b343 + x + 16, bt[0]); + LoadAligned64(b444 + x + 16, bt[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt); + const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 16, d1); + + sq[0] = sq[6]; + sq[1] = sq[7]; + ma3[0] = ma3[2]; + ma5[0] = ma5[2]; + b3[0] = b3[5]; + b3[1] = b3[6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const 
ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = 
square_sum5[1]; + const uint16_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], 
b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align<ptrdiff_t>(width, 32); + const auto sum_width = temp_stride + 8; + const 
auto sum_stride = temp_stride + 32; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. + uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum3[0], square_sum3[0]); + BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + const uint16_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + 
Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is non-multiple of 32, up to 31 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + auto* const dst = static_cast<uint16_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); #if DSP_ENABLED_10BPP_AVX2(WienerFilter) dsp->loop_restorations[0] = WienerFilter_AVX2; #endif +#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_AVX2; +#endif } } // namespace @@ -581,7 +3146,7 @@ void LoopRestorationInit10bpp_AVX2() { Init10bpp(); } } // namespace dsp } // namespace libgav1 -#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) +#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc index 0598435..96380e3 100644 --- a/src/dsp/x86/loop_restoration_10bit_sse4.cc +++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc @@ -428,13 +428,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, - const ptrdiff_t stride, const int width, - const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const 
bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -458,39 +457,42 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, const __m128i coefficients_horizontal = LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, coefficients_horizontal, 
&wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -522,6 +524,1978 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, } } +//------------------------------------------------------------------------------ +// SGR + +// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for +// Pass 1 and 2 for Pass 2. 
+constexpr int kOverreadInBytesPass1 = 4; +constexpr int kOverreadInBytesPass2 = 8; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned16x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned16x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); + dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x, + const ptrdiff_t border, __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border)); + dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border)); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned32x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); +} 
+ +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned32x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); + LoadAligned32U32Msan(src[2], x, border, dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) { + StoreAligned32U32(dst + 0, src + 0); + StoreAligned32U32(dst + 8, src + 2); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
+ +inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VmullNLo8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullNHi8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VrshrU16(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1))); + return _mm_srli_epi16(sum, src1); +} + +inline __m128i VrshrS32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 
1))); + return _mm_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline void Square(const __m128i src, __m128i dst[2]) { + const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128()); + dst[0] = _mm_madd_epi16(s0, s0); + dst[1] = _mm_madd_epi16(s1, s1); +} + +template <int offset> +inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) { + dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0); + dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1); + dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m128i Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], 
src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m128i Sum3WHi16(const __m128i src[3]) { + const __m128i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + 
Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0, + __m128i* const row3_1, __m128i* const row5_0, + __m128i* const row5_1) { + SumHorizontal16(src + 0, row3_0, row5_0); + SumHorizontal16(src + 1, row3_1, row5_1); +} + +void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline __m128i Sum343Lo(const __m128i ma3[3]) { + const __m128i sum = Sum3WLo16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m128i Sum343Hi(const __m128i ma3[3]) { + const __m128i sum = Sum3WHi16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m128i Sum343(const __m128i src[3]) { + const __m128i sum = Sum3_32(src); + const __m128i sum3 = Sum3_32(sum, sum, sum); + return _mm_add_epi32(sum3, src[1]); +} + +inline void Sum343(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum343(s); + Prepare3_32(src + 1, s); + dst[1] = Sum343(s); +} + +inline __m128i Sum565Lo(const __m128i src[3]) { + const __m128i sum = Sum3WLo16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return 
VaddwLo8(sum5, src[1]); +} + +inline __m128i Sum565Hi(const __m128i src[3]) { + const __m128i sum = Sum3WHi16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m128i Sum565(const __m128i src[3]) { + const __m128i sum = Sum3_32(src); + const __m128i sum4 = _mm_slli_epi32(sum, 2); + const __m128i sum5 = _mm_add_epi32(sum4, sum); + return _mm_add_epi32(sum5, src[1]); +} + +inline void Sum565(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum565(s); + Prepare3_32(src + 1, s); + dst[1] = Sum565(s); +} + +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src) * width; + int y = 2; + do { + __m128i s[3], sq[6]; + s[0] = LoadUnaligned16Msan(src, overread_in_bytes); + Square(s[0], sq); + ptrdiff_t x = sum_width; + do { + __m128i row3[2], row5[2], row_sq3[2], row_sq5[2]; + s[1] = LoadUnaligned16Msan( + src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8)); + x -= 16; + src += 16; + s[2] = LoadUnaligned16Msan( + src, overread_in_bytes + sizeof(*src) * (sum_width - x)); + Square(s[1], sq + 2); + Square(s[2], sq + 4); + SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned32U16(sum3, row3); + StoreAligned32U16(sum5, row5); + SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned32U32(square_sum3 + 0, row_sq3); + StoreAligned32U32(square_sum5 + 0, row_sq5); + SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0], + &row_sq5[1]); + StoreAligned32U32(square_sum3 + 8, row_sq3); + StoreAligned32U32(square_sum5 + 8, row_sq5); + s[0] = s[2]; + sq[0] = sq[4]; + sq[1] = sq[5]; + sum3 += 16; + sum5 += 16; + 
square_sum3 += 16; + square_sum5 += 16; + } while (x != 0); + src += src_stride - sum_width; + sum3 += sum_stride - sum_width; + sum5 += sum_stride - sum_width; + square_sum3 += sum_stride - sum_width; + square_sum5 += sum_stride - sum_width; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + const ptrdiff_t overread_in_bytes = + ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) - + sizeof(*src) * width; + int y = 2; + do { + __m128i s[3], sq[6]; + s[0] = LoadUnaligned16Msan(src, overread_in_bytes); + Square(s[0], sq); + ptrdiff_t x = sum_width; + do { + __m128i row[2], row_sq[4]; + s[1] = LoadUnaligned16Msan( + src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8)); + x -= 16; + src += 16; + s[2] = LoadUnaligned16Msan( + src, overread_in_bytes + sizeof(*src) * (sum_width - x)); + Square(s[1], sq + 2); + Square(s[2], sq + 4); + if (size == 3) { + row[0] = Sum3Horizontal16(s + 0); + row[1] = Sum3Horizontal16(s + 1); + Sum3Horizontal32(sq + 0, row_sq + 0); + Sum3Horizontal32(sq + 2, row_sq + 2); + } else { + row[0] = Sum5Horizontal16(s + 0); + row[1] = Sum5Horizontal16(s + 1); + Sum5Horizontal32(sq + 0, row_sq + 0); + Sum5Horizontal32(sq + 2, row_sq + 2); + } + StoreAligned32U16(sums, row); + StoreAligned64U32(square_sums, row_sq); + s[0] = s[2]; + sq[0] = sq[4]; + sq[1] = sq[5]; + sums += 16; + square_sums += 16; + } while (x != 0); + src += src_stride - sum_width; + sums += sum_stride - sum_width; + square_sums += sum_stride - sum_width; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 
0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n)); + __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4)); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m128i b = VrshrU16(sum, 2); + const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128()); + const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128()); + const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale); + const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale); + return _mm_packus_epi32(z0, z1); +} + +inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2); + b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2); +} + +inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) { + // one_over_n == 455. 
+ constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; + const __m128i m0 = VmullLo16(ma, sum); + const __m128i m1 = VmullHi16(ma, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + b[0] = VrshrU32(m2, kSgrProjReciprocalBits); + b[1] = VrshrU32(m3, kSgrProjReciprocalBits); +} + +inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +template <int n, int offset> +inline void LookupIntermediate(const __m128i sum, const __m128i index, + __m128i* const ma, __m128i b[2]) { + static_assert(n == 9 || n == 25, ""); + static_assert(offset == 0 || offset == 8, ""); + const __m128i idx = _mm_packus_epi16(index, index); + // Actually it's not stored and loaded. The compiler will use a 64-bit + // general-purpose register to process. Faster than using _mm_extract_epi8(). 
+ uint8_t temp[8]; + StoreLo8(temp, idx); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + __m128i maq; + if (offset == 0) { + maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + } else { + maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + } + if (n == 9) { + CalculateB3(sum, maq, b); + } else { + CalculateB5(sum, maq, b); + } +} + +// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b +// to get value 0 as the shuffle result. The most significiant bit 1 comes +// either from the comparison instruction, or from the sign bit of the index. 
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { + __m128i mask; + mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); + mask = _mm_or_si128(mask, index); + return _mm_shuffle_epi8(table, mask); +} + +inline __m128i AdjustValue(const __m128i value, const __m128i index, + const int threshold) { + const __m128i thresholds = _mm_set1_epi8(threshold - 128); + const __m128i offset = _mm_cmpgt_epi8(index, thresholds); + return _mm_add_epi8(value, offset); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i* const ma, __m128i b0[2], + __m128i b1[2]) { + // Use table lookup to read elements whose indices are less than 48. + const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); + const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); + const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); + const __m128i indices = _mm_packus_epi16(index[0], index[1]); + __m128i idx; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + *ma = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + *ma = _mm_or_si128(*ma, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res2 = ShuffleIndex(c2, idx); + *ma = _mm_or_si128(*ma, res2); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Add -128 to apply signed comparison instructions. 
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); + // Elements whose indices are larger than 47 (with value 0) are set to 5. + *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); + *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. + *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. + *ma = AdjustValue(*ma, idx, 101); // 101 is the last index which value is 3. + *ma = AdjustValue(*ma, idx, 169); // 169 is the last index which value is 2. + *ma = AdjustValue(*ma, idx, 254); // 254 is the last index which value is 1. + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[0], maq0, b0); + const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + CalculateB3(sum[1], maq1, b1); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i ma[2], __m128i b[4]) { + __m128i mas; + CalculateIntermediate(sum, index, &mas, b + 0, b + 2); + ma[0] = _mm_unpacklo_epi64(ma[0], mas); + ma[1] = _mm_srli_si128(mas, 8); +} + +// Note: It has been tried to call CalculateIntermediate() to replace the slow +// LookupIntermediate() when calculating 16 intermediate data points. However, +// the compiler generates even slower code. 
+template <int offset> +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + static_assert(offset == 0 || offset == 8, ""); + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25, offset>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9, 0>(sum, index, ma, b); +} + +inline void Store343_444(const __m128i b3[3], const ptrdiff_t x, + __m128i sum_b343[2], __m128i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m128i b[3], sum_b111[2]; + Prepare3_32(b3 + 0, b); + sum_b111[0] = Sum3_32(b); + sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2); + sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]); + Prepare3_32(b3 + 1, b); + sum_b111[1] = Sum3_32(b); + sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2); + sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]); + StoreAligned32U32(b444 + x, sum_b444); + StoreAligned32U32(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i 
b3[3], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][4], 
const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + s5[0][3] = Sum5Horizontal16(s[0] + 1); + s5[1][3] = Sum5Horizontal16(s[0] + 2); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + s5[0][4] = Sum5Horizontal16(s[1] + 1); + s5[1][4] = Sum5Horizontal16(s[1] + 2); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Sum5Horizontal32(sq[0] + 2, sq5[3]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1] + 2, sq5[4]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + 
LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[2], sq + 4); + s5[0][3] = Sum5Horizontal16(s + 1); + s5[1][3] = Sum5Horizontal16(s + 2); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq + 2, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[3], sq + 6); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + 
StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width, + const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s3[4], sq3[3][2], sum[2], index[2]; + Square(s[2], sq + 4); + s3[2] = Sum3Horizontal16(s + 1); + s3[3] = Sum3Horizontal16(s + 2); + StoreAligned32U16(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 2, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[3], sq + 6); + Sum3Horizontal32(sq + 4, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma, b + 2); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], 
sq5[3]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) { + __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2]; + SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + StoreAligned16(sum3[2] + x + 0, s3[0][2]); + StoreAligned16(sum3[2] + x + 8, s3[1][2]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + StoreAligned16(sum3[3] + x + 0, s3[0][3]); + StoreAligned16(sum3[3] + x + 8, s3[1][3]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + 
LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0], + &index[1][0]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1], + &index[1][1]); + CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2); + CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + 
LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2], + __m128i ma5[2], __m128i b3[6], __m128i b5[6]) { + __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2]; + Square(s[2], sq + 4); + SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned32x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]); + + Square(s[3], sq + 6); + SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma3, b3 + 2); +} + +inline void BoxSumFilterPreProcess5(const uint16_t* const src0, + const uint16_t* const src1, const int width, + const 
uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma5[3], ma[2], b[4]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, + bs); + Prepare3_8<0>(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned32U16(ma565, ma); + Sum565(bs + 0, b + 0); + Sum565(bs + 2, b + 2); + StoreAligned64U32(b565, b); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +template <bool calculate444> +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint16_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t 
overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src + x + 16, + overread_in_bytes + sizeof(*src) * (x + 16)); + s[3] = LoadUnaligned16Msan(src + x + 24, + overread_in_bytes + sizeof(*src) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma3[3]; + Prepare3_8<0>(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444); + ma444 += 16; + b444 += 16; + } else { + __m128i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned32U16(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 2, b + 2); + StoreAligned64U32(b343, b); + } + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma343 += 16; + b343 += 16; + x += 16; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, 
overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[2], b[4], ma3x[3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + + Prepare3_8<0>(ma3[0], ma3x); + ma[0] = Sum343Lo(ma3x); + ma[1] = Sum343Hi(ma3x); + StoreAligned32U16(ma343[0] + x, ma); + Sum343(b3[0] + 0, b + 0); + Sum343(b3[0] + 2, b + 2); + StoreAligned64U32(b343[0] + x, b); + Sum565(b5 + 0, b + 0); + Sum565(b5 + 2, b + 2); + StoreAligned64U32(b565, b); + Prepare3_8<0>(ma3[1], ma3x); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444); + Prepare3_8<0>(ma5, ma5x); + ma[0] = Sum565Lo(ma5x); + ma[1] = Sum565Hi(ma5x); + StoreAligned32U16(ma565, ma); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +template <int shift> +inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) { + // ma: 255 * 32 = 8160 (13 bits) + // b: 65088 * 32 = 2082816 (21 bits) + // 
v: b - ma * 255 (22 bits) + const __m128i v = _mm_sub_epi32(b, ma_x_src); + // kSgrProjSgrBits = 8 + // kSgrProjRestoreBits = 4 + // shift = 4 or 5 + // v >> 8 or 9 (13 bits) + return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits); +} + +template <int shift> +inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma, + const __m128i b[2]) { + const __m128i ma_x_src_lo = VmullLo16(ma, src); + const __m128i ma_x_src_hi = VmullHi16(ma, src); + const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]); + const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]); + return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits +} + +inline __m128i CalculateFilteredOutputPass1(const __m128i src, + const __m128i ma[2], + const __m128i b[2][2]) { + const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]); + __m128i b_sum[2]; + b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]); + b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m128i CalculateFilteredOutputPass2(const __m128i src, + const __m128i ma[3], + const __m128i b[3][2]) { + const __m128i ma_sum = Sum3_16(ma); + __m128i b_sum[2]; + Sum3_32(b, b_sum); + return CalculateFilteredOutput<5>(src, ma_sum, b_sum); +} + +inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) { + const __m128i v_lo = + VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i v_hi = + VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits); + const __m128i vv = _mm_packs_epi32(v_lo, v_hi); + return _mm_add_epi16(src, vv); +} + +inline __m128i SelfGuidedDoubleMultiplier(const __m128i src, + const __m128i filter[2], const int w0, + const int w2) { + __m128i v[2]; + const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0)); + const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]); + const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]); + v[0] = _mm_madd_epi16(w0_w2, f_lo); + v[1] = 
_mm_madd_epi16(w0_w2, f_hi); + return SelfGuidedFinal(src, v); +} + +inline __m128i SelfGuidedSingleMultiplier(const __m128i src, + const __m128i filter, const int w0) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + __m128i v[2]; + v[0] = VmullNLo8(filter, w0); + v[1] = VmullNHi8(filter, w0); + return SelfGuidedFinal(src, v); +} + +inline void ClipAndStore(uint16_t* const dst, const __m128i val) { + const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128()); + const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023)); + StoreAligned16(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2], p[2]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, + bs); + 
Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + StoreAligned16(ma565[1] + x, ma[1]); + Sum565(bs, b[1]); + StoreAligned32U32(b565[1] + x, b[1]); + const __m128i sr0_lo = LoadAligned16(src + x + 0); + const __m128i sr1_lo = LoadAligned16(src + stride + x + 0); + ma[0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); + const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); + + ma[1] = Sum565Hi(ma5); + StoreAligned16(ma565[1] + x + 8, ma[1]); + Sum565(bs + 2, b[1]); + StoreAligned32U32(b565[1] + x + 8, b[1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], 
mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5, + sq, mas, bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + Sum565(bs, b[1]); + ma[0] = LoadAligned16(ma565); + LoadAligned32U32(b565, b[0]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0); + + ma[1] = Sum565Hi(ma5); + Sum565(bs + 2, b[1]); + ma[0] = LoadAligned16(ma565 + 8); + LoadAligned32U32(b565 + 8, b[0]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + p = CalculateFilteredOutputPass1(sr_hi, ma, b); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src0) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = 
LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma[3], b[3][2], ma3[3]; + Prepare3_8<0>(mas, ma3); + Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + + Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[0]); + LoadAligned32U32(b444[0] + x + 8, b[1]); + const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* 
const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Prepare3_8<0>(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + StoreAligned16(ma565[1] + x, ma[0][1]); + Sum565(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const __m128i sr0_lo = LoadAligned16(src + x); + const __m128i sr1_lo = LoadAligned16(src + stride + x); + ma[0][0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + 
p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x); + ma[1][1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + StoreAligned16(ma565[1] + x + 8, ma[0][1]); + Sum565(b5 + 2, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0][0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x + 8); + ma[1][1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + ma[2][0] = LoadAligned16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, 
d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq, &ma3[0], &ma5[0], b3, b5); + + int x = 0; + do { + __m128i ma3x[3], ma5x[3], p[2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + 
ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565(b5 + 2, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343(b3 + 2, b[2]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + b3[0] = b3[4]; + b3[1] = b3[5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2<uint16_t>(sum3); + Circulate4PointersBy2<uint32_t>(square_sum3); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + Circulate4PointersBy2<uint16_t>(ma343); + Circulate4PointersBy2<uint32_t>(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + 
BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2<uint16_t>(sum5); + Circulate5PointersBy2<uint32_t>(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align<ptrdiff_t>(width, 16); + const auto sum_width = Align<ptrdiff_t>(width + 8, 16); + const auto 
sum_stride = temp_stride + 16; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. + uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width, + sum3[0], square_sum3[0]); + BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + const uint16_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + Circulate3PointersBy1<uint16_t>(sum3); + 
Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is non-multiple of 16, up to 15 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + auto* const dst = static_cast<uint16_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); @@ -531,6 +2505,11 @@ void Init10bpp() { #else static_cast<void>(WienerFilter_SSE4_1); #endif +#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1; +#else + static_cast<void>(SelfGuidedFilter_SSE4_1); +#endif } } // namespace @@ -540,7 +2519,7 @@ void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); } } // namespace dsp } // namespace libgav1 -#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10) +#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10) namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc index 7ae7c90..351a324 100644 --- a/src/dsp/x86/loop_restoration_avx2.cc +++ b/src/dsp/x86/loop_restoration_avx2.cc @@ -28,7 +28,6 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/x86/common_avx2.h" -#include "src/dsp/x86/common_sse4.h" #include "src/utils/common.h" #include "src/utils/constants.h" @@ -116,7 +115,8 @@ inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100)); filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102)); - filter[3] = 
_mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000)); + filter[3] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -144,7 +144,8 @@ inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, __m256i filter[3]; filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201)); filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203)); - filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001)); + filter[2] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -171,7 +172,8 @@ inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, int16_t** const wiener_buffer) { __m256i filter[2]; filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); - filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002)); + filter[1] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002))); for (int y = height; y != 0; --y) { __m256i s = LoadUnaligned32(src); __m256i ss[4]; @@ -480,12 +482,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, const ptrdiff_t stride, - const int width, const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* 
const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -515,39 +517,42 @@ void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal); const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, coefficients_horizontal, + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, 
coefficients_horizontal, + &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. - WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, coefficients_horizontal, - &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -765,17 +770,6 @@ inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) { return _mm256_add_epi32(src0, s1); } -// Using VgetLane16() can save a sign extension instruction. 
-template <int n> -inline int VgetLane16(__m256i src) { - return _mm256_extract_epi16(src, n); -} - -template <int n> -inline int VgetLane8(__m256i src) { - return _mm256_extract_epi8(src, n); -} - inline __m256i VmullNLo8(const __m256i src0, const int src1) { const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); @@ -1253,9 +1247,8 @@ inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, do { const __m128i s0 = LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width); - __m128i sq_128[2]; + __m128i sq_128[2], s3, s5, sq3[2], sq5[2]; __m256i sq[3]; - __m128i s3, s5, sq3[2], sq5[2]; sq_128[0] = SquareLo8(s0); sq_128[1] = SquareHi8(s0); SumHorizontalLo(s0, &s3, &s5); @@ -1432,11 +1425,43 @@ inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2], return _mm256_packus_epi32(z0, z1); } -template <int n> -inline __m128i CalculateB(const __m128i sum, const __m128i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m128i CalculateB5(const __m128i sum, const __m128i ma) { + // one_over_n == 164. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. + constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline __m256i CalculateB5(const __m256i sum, const __m256i ma) { + // one_over_n == 164. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. 
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m256i m = + _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter)); + const __m256i m0 = VmullLo16(m, sum); + const __m256i m1 = VmullHi16(m, sum); + const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm256_packus_epi32(b_lo, b_hi); +} + +inline __m128i CalculateB3(const __m128i sum, const __m128i ma) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m128i m0 = VmullLo16(ma, sum); const __m128i m1 = VmullHi16(ma, sum); const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); @@ -1446,11 +1471,10 @@ inline __m128i CalculateB(const __m128i sum, const __m128i ma) { return _mm_packus_epi32(b_lo, b_hi); } -template <int n> -inline __m256i CalculateB(const __m256i sum, const __m256i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m256i CalculateB3(const __m256i sum, const __m256i ma) { + // one_over_n == 455. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m256i m0 = VmullLo16(ma, sum); const __m256i m1 = VmullHi16(ma, sum); const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n)); @@ -1525,7 +1549,7 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index, // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); - *b = CalculateB<n>(sum, maq); + *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq); } // Repeat the first 48 elements in kSgrMaLookup with a period of 16. 
@@ -1539,7 +1563,7 @@ alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = { // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b // to get value 0 as the shuffle result. The most significiant bit 1 comes -// either from the comparision instruction, or from the sign bit of the index. +// either from the comparison instruction, or from the sign bit of the index. inline __m256i ShuffleIndex(const __m256i table, const __m256i index) { __m256i mask; mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15)); @@ -1558,15 +1582,15 @@ template <int n> inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], __m256i ma[3], __m256i b[2]) { static_assert(n == 9 || n == 25, ""); - // Use table lookup to read elements which indices are less than 48. + // Use table lookup to read elements whose indices are less than 48. const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32); const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32); const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32); const __m256i indices = _mm256_packus_epi16(index[0], index[1]); __m256i idx, mas; - // Clip idx to 127 to apply signed comparision instructions. + // Clip idx to 127 to apply signed comparison instructions. idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127)); - // All elements which indices are less than 48 are set to 0. + // All elements whose indices are less than 48 are set to 0. // Get shuffle results for indices in range [0, 15]. mas = ShuffleIndex(c0, idx); // Get shuffle results for indices in range [16, 31]. 
@@ -1581,12 +1605,12 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], const __m256i res2 = ShuffleIndex(c2, idx); mas = _mm256_or_si256(mas, res2); - // For elements which indices are larger than 47, since they seldom change + // For elements whose indices are larger than 47, since they seldom change // values with the increase of the index, we use comparison and arithmetic // operations to calculate their values. - // Add -128 to apply signed comparision instructions. + // Add -128 to apply signed comparison instructions. idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128)); - // Elements which indices are larger than 47 (with value 0) are set to 5. + // Elements whose indices are larger than 47 (with value 0) are set to 5. mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5)); mas = AdjustValue(mas, idx, 55); // 55 is the last index which value is 5. mas = AdjustValue(mas, idx, 72); // 72 is the last index which value is 4. @@ -1611,8 +1635,13 @@ inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2], // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256()); const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256()); - b[0] = CalculateB<n>(sum[0], maq0); - b[1] = CalculateB<n>(sum[1], maq1); + if (n == 9) { + b[0] = CalculateB3(sum[0], maq0); + b[1] = CalculateB3(sum[1], maq1); + } else { + b[0] = CalculateB5(sum[0], maq0); + b[1] = CalculateB5(sum[1], maq1); + } } inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], @@ -1903,8 +1932,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) { const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8); const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8); - __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2], - sum_3[2][2], index_3[2][2], sum_5[2], index_5[2]; + __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2], + sum_5[2], index_5[2]; sq[0][1] = SquareLo8(s0); sq[0][2] = SquareHi8(s0); sq[1][1] = SquareLo8(s1); @@ -1938,22 +1967,22 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( LoadAligned64x3U32(square_sum5, x, sq5); CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); - SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]); - SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]); - StoreAligned64(square_sum3[2] + x + 16, sq3t[2]); - StoreAligned64(square_sum5[3] + x + 16, sq5t[3]); - StoreAligned64(square_sum3[3] + x + 16, sq3t[3]); - StoreAligned64(square_sum5[4] + x + 16, sq5t[4]); + SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); LoadAligned32x2U16Msan(sum3, x 
+ 16, sum_width, s3[1]); - LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t); - CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]); - CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1], + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], &index_3[1][1]); CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1); CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1); LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); - LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t); - CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21); b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21); @@ -1988,8 +2017,8 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], __m256i b5[5]) { const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8); - __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2], - sum_3[2], index_3[2], sum_5[2], index_5[2]; + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2]; sq[1] = SquareLo8(s0); sq[2] = SquareHi8(s0); sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); @@ -2006,17 +2035,17 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( sq5[4][1] = sq5[3][1]; CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); - SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]); + SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); 
LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); - LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t); - CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]); CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1); LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); s5[1][4] = s5[1][3]; - LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t); - sq5t[4][0] = sq5t[3][0]; - sq5t[4][1] = sq5t[3][1]; - CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21); b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21); @@ -2071,9 +2100,9 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( uint16_t* const sum3[3], uint32_t* const square_sum3[3], const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, uint32_t* b444) { + const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width); __m128i ma0, sq_128[2], b0; __m256i mas[3], sq[3], bs[3]; - const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width); sq_128[0] = SquareLo8(s); BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0); sq[0] = SetrM128i(sq_128[0], sq_128[1]); @@ -2115,9 +2144,9 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - const ptrdiff_t sum_width, uint16_t* const ma343[4], - uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4], - uint32_t* const 
b444[2], uint32_t* b565) { + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0; __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5]; s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); @@ -2151,9 +2180,8 @@ inline void BoxSumFilterPreProcess( Sum565W(b5, b); StoreAligned64(b565, b); Prepare3_8(ma3[1], ma3x); - Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1], - b444[0]); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444); Prepare3_8(ma5, ma5x); ma[0] = Sum565Lo(ma5x); ma[1] = Sum565Hi(ma5x); @@ -2199,8 +2227,9 @@ inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma, return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits } -inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2], - __m256i b[2][2]) { +inline __m256i CalculateFilteredOutputPass1(const __m256i src, + const __m256i ma[2], + const __m256i b[2][2]) { const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]); __m256i b_sum[2]; b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]); @@ -2208,8 +2237,9 @@ inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2], return CalculateFilteredOutput<5>(src, ma_sum, b_sum); } -inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3], - __m256i b[3][2]) { +inline __m256i CalculateFilteredOutputPass2(const __m256i src, + const __m256i ma[3], + const __m256i b[3][2]) { const __m256i ma_sum = Sum3_16(ma); __m256i b_sum[2]; Sum3_32(b, b_sum); @@ -2267,13 +2297,13 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( int x = 0; do { - __m256i ma[3], ma3[3], b[2][2][2]; + __m256i ma[3], ma5[3], b[2][2][2]; BoxFilterPreProcess5(src0 + x + 8, src1 
+ x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, bs); - Prepare3_8(mas, ma3); - ma[1] = Sum565Lo(ma3); - ma[2] = Sum565Hi(ma3); + Prepare3_8(mas, ma5); + ma[1] = Sum565Lo(ma5); + ma[2] = Sum565Hi(ma5); StoreAligned64(ma565[1] + x, ma + 1); Sum565W(bs + 0, b[0][1]); Sum565W(bs + 1, b[1][1]); @@ -2511,9 +2541,9 @@ inline void BoxFilterLastRow( const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* const dst) { + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { const __m128i s0 = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2]; @@ -2542,13 +2572,13 @@ inline void BoxFilterLastRow( Sum343W(b3, b[2]); const __m256i sr = LoadUnaligned32(src + x); const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256()); - ma[0] = LoadAligned32(ma565[0] + x); - LoadAligned64(b565[0] + x, b[0]); + ma[0] = LoadAligned32(ma565 + x); + LoadAligned64(b565 + x, b[0]); p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); - ma[0] = LoadAligned32(ma343[0] + x); - ma[1] = LoadAligned32(ma444[0] + x); - LoadAligned64(b343[0] + x, b[0]); - LoadAligned64(b444[0] + x, b[1]); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); @@ -2557,13 +2587,13 @@ inline void BoxFilterLastRow( mat[2] = Sum343Hi(ma3x); Sum343W(b3 + 1, b[2]); const __m256i sr_hi = 
_mm256_unpackhi_epi8(sr, _mm256_setzero_si256()); - mat[0] = LoadAligned32(ma565[0] + x + 16); - LoadAligned64(b565[0] + x + 16, b[0]); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, b[0]); p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b); - mat[0] = LoadAligned32(ma343[0] + x + 16); - mat[1] = LoadAligned32(ma444[0] + x + 16); - LoadAligned64(b343[0] + x + 16, b[0]); - LoadAligned64(b444[0] + x + 16, b[1]); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + LoadAligned64(b343 + x + 16, b[0]); + LoadAligned64(b444 + x + 16, b[1]); p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b); const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); @@ -2578,8 +2608,9 @@ inline void BoxFilterLastRow( LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; @@ -2619,14 +2650,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1], - square_sum3[0], square_sum5[1]); + BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, sum_width, ma343, ma444, ma565[0], b343, - b444, b565[0]); + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5 + kSumOffset; square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; @@ -2656,7 +2687,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2680,19 +2711,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales, - w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444, - ma565, b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; const auto sum_stride = temp_stride + 32; @@ -2712,8 +2745,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, width, sum_stride, 
temp_stride, sum5[1], - square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -2739,7 +2772,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2757,18 +2790,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale, - w0, sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 32); const auto sum_width = temp_stride + 8; @@ -2794,8 +2829,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0], - square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, 
width, sum_stride, temp_stride, + sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, sum_width, ma343[0], nullptr, b343[0], nullptr); @@ -2806,7 +2841,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, ma343[1], ma444[0], b343[1], b444[0]); @@ -2833,7 +2868,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, square_sum3, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -2841,13 +2876,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } while (--y != 0); } -// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in -// the end of each row. It is safe to overwrite the output as it will not be +// If |width| is non-multiple of 32, up to 31 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_AVX2( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -2861,14 +2897,17 @@ void SelfGuidedFilter_AVX2( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -2891,7 +2930,7 @@ void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_AVX2 +#else // !LIBGAV1_TARGETING_AVX2 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h index d80227c..2c3534a 100644 --- 
a/src/dsp/x86/loop_restoration_avx2.h +++ b/src/dsp/x86/loop_restoration_avx2.h @@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_AVX2(); #define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2 #endif +#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter +#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2 +#endif + #endif // LIBGAV1_TARGETING_AVX2 #endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 24f5ad2..273bcc8 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -481,13 +481,12 @@ inline void WienerVerticalTap1(const int16_t* wiener_buffer, } } -void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, - const void* const source, const void* const top_border, - const void* const bottom_border, - const ptrdiff_t stride, const int width, - const int height, - RestorationBuffer* const restoration_buffer, - void* const dest) { +void WienerFilter_SSE4_1( + const RestorationUnitInfo& restoration_info, const void* const source, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { const int16_t* const number_leading_zero_coefficients = restoration_info.wiener_info.number_leading_zero_coefficients; const int number_rows_to_skip = std::max( @@ -516,45 +515,48 @@ void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, const __m128i coefficients_horizontal = _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { - WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, - wiener_stride, height_extra, filter_horizontal[0], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(src - 3, stride, 
wiener_stride, height, + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { - WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, - wiener_stride, height_extra, filter_horizontal[1], + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[0], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, filter_horizontal[1], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[1], + coefficients_horizontal, &wiener_buffer_horizontal); } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { // The maximum over-reads happen here. 
- WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, - wiener_stride, height_extra, filter_horizontal[2], - coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); - WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, filter_horizontal[2], coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal[2], + coefficients_horizontal, &wiener_buffer_horizontal); } else { assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); - WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, - wiener_stride, height_extra, + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, &wiener_buffer_horizontal); WienerHorizontalTap1(src, stride, wiener_stride, height, &wiener_buffer_horizontal); - WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, - &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); } // vertical filtering. @@ -1160,11 +1162,26 @@ inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], return _mm_packus_epi32(z0, z1); } -template <int n> -inline __m128i CalculateB(const __m128i sum, const __m128i ma) { - static_assert(n == 9 || n == 25, ""); +inline __m128i CalculateB5(const __m128i sum, const __m128i ma) { + // one_over_n == 164. constexpr uint32_t one_over_n = - ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25; + // one_over_n_quarter == 41. 
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2; + static_assert(one_over_n == one_over_n_quarter << 2, ""); + // |ma| is in range [0, 255]. + const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter)); + const __m128i m0 = VmullLo16(m, sum); + const __m128i m1 = VmullHi16(m, sum); + const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2); + const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline __m128i CalculateB3(const __m128i sum, const __m128i ma) { + // one_over_n == 455. + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9; const __m128i m0 = VmullLo16(ma, sum); const __m128i m1 = VmullHi16(ma, sum); const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); @@ -1227,12 +1244,12 @@ inline void LookupIntermediate(const __m128i sum, const __m128i index, } else { maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); } - *b = CalculateB<n>(sum, maq); + *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq); } // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b // to get value 0 as the shuffle result. The most significiant bit 1 comes -// either from the comparision instruction, or from the sign bit of the index. +// either from the comparison instruction, or from the sign bit of the index. inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { __m128i mask; mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); @@ -1250,15 +1267,15 @@ inline __m128i AdjustValue(const __m128i value, const __m128i index, inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], __m128i* const ma, __m128i* const b0, __m128i* const b1) { - // Use table lookup to read elements which indices are less than 48. + // Use table lookup to read elements whose indices are less than 48. 
const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); const __m128i indices = _mm_packus_epi16(index[0], index[1]); __m128i idx; - // Clip idx to 127 to apply signed comparision instructions. + // Clip idx to 127 to apply signed comparison instructions. idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); - // All elements which indices are less than 48 are set to 0. + // All elements whose indices are less than 48 are set to 0. // Get shuffle results for indices in range [0, 15]. *ma = ShuffleIndex(c0, idx); // Get shuffle results for indices in range [16, 31]. @@ -1273,12 +1290,12 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], const __m128i res2 = ShuffleIndex(c2, idx); *ma = _mm_or_si128(*ma, res2); - // For elements which indices are larger than 47, since they seldom change + // For elements whose indices are larger than 47, since they seldom change // values with the increase of the index, we use comparison and arithmetic // operations to calculate their values. - // Add -128 to apply signed comparision instructions. + // Add -128 to apply signed comparison instructions. idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); - // Elements which indices are larger than 47 (with value 0) are set to 5. + // Elements whose indices are larger than 47 (with value 0) are set to 5. *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); *ma = AdjustValue(*ma, idx, 55); // 55 is the last index which value is 5. *ma = AdjustValue(*ma, idx, 72); // 72 is the last index which value is 4. @@ -1298,9 +1315,9 @@ inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). 
const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); - *b0 = CalculateB<9>(sum[0], maq0); + *b0 = CalculateB3(sum[0], maq0); const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); - *b1 = CalculateB<9>(sum[1], maq1); + *b1 = CalculateB3(sum[1], maq1); } inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], @@ -1776,9 +1793,9 @@ inline void BoxSumFilterPreProcess( const uint8_t* const src0, const uint8_t* const src1, const int width, const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - const ptrdiff_t sum_width, uint16_t* const ma343[4], - uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4], - uint32_t* const b444[2], uint32_t* b565) { + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); @@ -1808,9 +1825,8 @@ inline void BoxSumFilterPreProcess( Sum565W(b5 + 1, b + 2); StoreAligned64U32(b565, b); Prepare3_8<0>(ma3[1], ma3x); - Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]); - Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1], - b444[0]); + Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444); + Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444); Prepare3_8<0>(ma5, ma5x); ma[0] = Sum565Lo(ma5x); ma[1] = Sum565Hi(ma5x); @@ -1854,8 +1870,9 @@ inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma, return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits } -inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], - __m128i b[2][2]) { +inline __m128i CalculateFilteredOutputPass1(const __m128i src, + const __m128i 
ma[2], + const __m128i b[2][2]) { const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]); __m128i b_sum[2]; b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]); @@ -1863,8 +1880,9 @@ inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2], return CalculateFilteredOutput<5>(src, ma_sum, b_sum); } -inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3], - __m128i b[3][2]) { +inline __m128i CalculateFilteredOutputPass2(const __m128i src, + const __m128i ma[3], + const __m128i b[3][2]) { const __m128i ma_sum = Sum3_16(ma); __m128i b_sum[2]; Sum3_32(b, b_sum); @@ -1916,15 +1934,15 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( int x = 0; do { - __m128i ma[2], ma3[3], b[2][2], sr[2], p[2]; + __m128i ma[2], ma5[3], b[2][2], sr[2], p[2]; s[0][1] = LoadUnaligned16Msan(src0 + x + 16, x + 16 + kOverreadInBytesPass1 - width); s[1][1] = LoadUnaligned16Msan(src1 + x + 16, x + 16 + kOverreadInBytesPass1 - width); BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, bs); - Prepare3_8<0>(mas, ma3); - ma[1] = Sum565Lo(ma3); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); StoreAligned16(ma565[1] + x, ma[1]); Sum565W(bs, b[1]); StoreAligned32U32(b565[1] + x, b[1]); @@ -1939,7 +1957,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); - ma[1] = Sum565Hi(ma3); + ma[1] = Sum565Hi(ma5); StoreAligned16(ma565[1] + x + 8, ma[1]); Sum565W(bs + 1, b[1]); StoreAligned32U32(b565[1] + x + 8, b[1]); @@ -2158,9 +2176,9 @@ inline void BoxFilterLastRow( const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], - uint16_t* const ma343[4], uint16_t* const ma444[3], - uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], - uint32_t* const b565[2], uint8_t* 
const dst) { + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2]; s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); sq[0] = SquareLo8(s[0]); @@ -2183,13 +2201,13 @@ inline void BoxFilterLastRow( Sum343W(b3, b[2]); const __m128i sr = LoadAligned16(src + x); const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x); - LoadAligned32U32(b565[0] + x, b[0]); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); - ma[0] = LoadAligned16(ma343[0] + x); - ma[1] = LoadAligned16(ma444[0] + x); - LoadAligned32U32(b343[0] + x, b[0]); - LoadAligned32U32(b444[0] + x, b[1]); + ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); @@ -2198,13 +2216,13 @@ inline void BoxFilterLastRow( ma[2] = Sum343Hi(ma3x); Sum343W(b3 + 1, b[2]); const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); - ma[0] = LoadAligned16(ma565[0] + x + 8); - LoadAligned32U32(b565[0] + x + 8, b[0]); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); - ma[0] = LoadAligned16(ma343[0] + x + 8); - ma[1] = LoadAligned16(ma444[0] + x + 8); - LoadAligned32U32(b343[0] + x + 8, b[0]); - LoadAligned32U32(b444[0] + x + 8, b[1]); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); StoreAligned16(dst + x, 
_mm_packus_epi16(d0, d1)); @@ -2220,8 +2238,9 @@ inline void BoxFilterLastRow( LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const RestorationUnitInfo& restoration_info, const uint8_t* src, - const uint8_t* const top_border, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2261,14 +2280,14 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( b565[1] = b565[0] + temp_stride; assert(scales[0] != 0); assert(scales[1] != 0); - BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1], - square_sum3[0], square_sum5[1]); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, - square_sum5, sum_width, ma343, ma444, ma565[0], b343, - b444, b565[0]); + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); sum5[0] = sgr_buffer->sum5; square_sum5[0] = sgr_buffer->square_sum5; @@ -2298,7 +2317,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2322,19 +2341,21 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( std::swap(ma565[0], ma565[1]); std::swap(b565[0], b565[1]); } - BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales, - w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444, - ma565, b343, b444, b565, dst); + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); } } inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); const auto sum_stride = temp_stride + 16; @@ -2354,8 +2375,8 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, b565[0] = sgr_buffer->b565; b565[1] = b565[0] + temp_stride; assert(scale != 0); - BoxSum<5>(top_border, stride, width, sum_stride, sum_width, 
sum5[1], - square_sum5[1]); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); sum5[0] = sum5[1]; square_sum5[0] = square_sum5[1]; const uint8_t* const s = (height > 1) ? src + stride : bottom_border; @@ -2381,7 +2402,7 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, const uint8_t* sr[2]; if ((height & 1) == 0) { sr[0] = bottom_border; - sr[1] = bottom_border + stride; + sr[1] = bottom_border + bottom_border_stride; } else { sr[0] = src + 2 * stride; sr[1] = bottom_border; @@ -2399,18 +2420,20 @@ inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, Circulate5PointersBy2<uint16_t>(sum5); Circulate5PointersBy2<uint32_t>(square_sum5); } - BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale, - w0, sum5, square_sum5, ma565[0], b565[0], dst); + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); } } inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, - const uint8_t* src, + const uint8_t* src, const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, - const ptrdiff_t stride, const int width, - const int height, SgrBuffer* const sgr_buffer, - uint8_t* dst) { + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { assert(restoration_info.sgr_proj_info.multiplier[0] == 0); const auto temp_stride = Align<ptrdiff_t>(width, 16); const auto sum_width = Align<ptrdiff_t>(width + 8, 16); @@ -2436,8 +2459,8 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, b444[0] = sgr_buffer->b444; b444[1] = b444[0] + temp_stride; assert(scale != 0); - BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0], - square_sum3[0]); + BoxSum<3>(top_border, top_border_stride, width, 
sum_stride, sum_width, + sum3[0], square_sum3[0]); BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, sum_width, ma343[0], nullptr, b343[0], nullptr); @@ -2448,7 +2471,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, s = src + stride; } else { s = bottom_border; - bottom_border += stride; + bottom_border += bottom_border_stride; } BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, ma343[1], ma444[0], b343[1], b444[0]); @@ -2475,7 +2498,7 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, square_sum3, ma343, ma444, b343, b444, dst); src += stride; dst += stride; - bottom_border += stride; + bottom_border += bottom_border_stride; Circulate3PointersBy1<uint16_t>(ma343); Circulate3PointersBy1<uint32_t>(b343); std::swap(ma444[0], ma444[1]); @@ -2483,13 +2506,14 @@ inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, } while (--y != 0); } -// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in -// the end of each row. It is safe to overwrite the output as it will not be +// If |width| is non-multiple of 16, up to 15 more pixels are written to |dest| +// in the end of each row. It is safe to overwrite the output as it will not be // part of the visible frame. 
void SelfGuidedFilter_SSE4_1( const RestorationUnitInfo& restoration_info, const void* const source, - const void* const top_border, const void* const bottom_border, - const ptrdiff_t stride, const int width, const int height, + const ptrdiff_t stride, const void* const top_border, + const ptrdiff_t top_border_stride, const void* const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, RestorationBuffer* const restoration_buffer, void* const dest) { const int index = restoration_info.sgr_proj_info.index; const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 @@ -2503,14 +2527,17 @@ void SelfGuidedFilter_SSE4_1( // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the // following assertion. assert(radius_pass_0 != 0); - BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); } else if (radius_pass_0 == 0) { - BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, - stride, width, height, sgr_buffer, dst); + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); } else { - BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, - width, height, sgr_buffer, dst); + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); } } @@ -2538,7 +2565,7 @@ void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h index 65b2b11..00df3af 100644 
--- a/src/dsp/x86/loop_restoration_sse4.h +++ b/src/dsp/x86/loop_restoration_sse4.h @@ -47,6 +47,10 @@ void LoopRestorationInit10bpp_SSE4_1(); #define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter +#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_ diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index d8036be..2e836af 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -430,12 +430,515 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kMaskInverse = 64; +constexpr int kRoundBitsMaskBlend = 4; + +inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, + const __m128i zero) { + // Shift out all but the last bit. + const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); + // Avg with zero will shift by 1 and round. 
+ return _mm_avg_epu16(v_tmp_d, zero); +} + +inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, + const __m128i shift) { + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift); + return _mm_srai_epi32(v_tmp_d, bits); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride, + const __m128i zero) { + if (subsampling_x == 1) { + if (subsampling_y == 0) { + const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); + } + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_val_0 = + LoadHi8(LoadLo8(mask), mask + (mask_stride << 1)); + const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride), + mask + (mask_stride << 1) + mask_stride); + const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i subsampled_mask = _mm_maddubs_epi16(add, one); + return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val_0 = Load4(mask); + const __m128i mask_val_1 = Load4(mask + mask_stride); + return _mm_cvtepu8_epi16( + _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, + const __m128i zero) { + if (subsampling_x == 1) { + if (subsampling_y == 0) { + const __m128i row_vals = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); + const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); + } + const __m128i one = _mm_set1_epi8(1); + const 
__m128i mask_val_0 = LoadUnaligned16(mask); + const __m128i mask_val_1 = LoadUnaligned16(mask + stride); + const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRoundingZero_U16(mask_0, 2, zero); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val = LoadLo8(mask); + return _mm_cvtepu8_epi16(mask_val); +} + +inline void WriteMaskBlendLine10bpp4x2_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, + const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max, + const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) { + const __m128i pred_val_0 = LoadUnaligned16(pred_0); + const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1); + + // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6; + const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0); + const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0); + const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1); + const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1); + const __m128i pack0_lo = + _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack0_hi = + _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack1_lo = + _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i pack1_hi = + _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + // res -= (bitdepth == 8) ? 
0 : kCompoundOffset; + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + + // dst[x] = static_cast<Pixel>( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreLo8(dst, result); + StoreHi8(dst + dst_stride, result); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* mask, + const ptrdiff_t mask_stride, uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); +} + +template <int subsampling_x, int subsampling_y> +inline 
void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, + const int height, uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const uint8_t pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, 
max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlend10bpp_SSE4_1(const void* prediction_0, + const void* prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); 
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + const __m128i compound_pred_lo_0 = + _mm_mullo_epi16(pred_val_0, pred_mask_0); + const __m128i compound_pred_hi_0 = + _mm_mulhi_epu16(pred_val_0, pred_mask_0); + const __m128i compound_pred_lo_1 = + _mm_mullo_epi16(pred_val_1, pred_mask_1); + const __m128i compound_pred_hi_1 = + _mm_mulhi_epu16(pred_val_1, pred_mask_1); + const __m128i pack0_lo = + _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack0_hi = + _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack1_lo = + _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i pack1_hi = + _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = + _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreUnaligned16(dst + x, result); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( + const uint16_t* prediction_0, const uint16_t* prediction_1, + const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, + const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i pred_val_0 = LoadUnaligned16(prediction_0); + const __m128i pred_val_1 = + 
LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1); + + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + const __m128i res = _mm_packus_epi32(shift_0, shift_1); + StoreLo8(dst, res); + StoreHi8(dst + dst_stride, res); +} + +template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlend10bpp4x4_SSE4_1( + const uint16_t* pred_0, const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* mask, + const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); +} + +template <int subsampling_x, int subsampling_y> +inline void 
InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0, + const uint16_t* pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, + const int height, uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const uint8_t pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += 
dst_stride2; + + pred_mask_0 = + GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlend10bpp_SSE4_1( + const void* prediction_0, const void* prediction_1, + const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr, + const ptrdiff_t mask_stride, const int width, const int height, void* dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); + const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i 
mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1)); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444) + dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422) + dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420) + dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444) + dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422) + dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420) + dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void MaskBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/mask_blend_sse4.h 
b/src/dsp/x86/mask_blend_sse4.h index 52b0b5c..4a95f0c 100644 --- a/src/dsp/x86/mask_blend_sse4.h +++ b/src/dsp/x86/mask_blend_sse4.h @@ -55,6 +55,30 @@ void MaskBlendInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlend444 +#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlend422 +#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlend420 +#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 +#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1 +#endif + #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc index c506941..e3f2cce 100644 --- a/src/dsp/x86/motion_field_projection_sse4.cc +++ b/src/dsp/x86/motion_field_projection_sse4.cc @@ -139,9 +139,9 @@ inline void Store(const __m128i position, const __m128i reference_offset, const ptrdiff_t offset = static_cast<int16_t>(_mm_extract_epi16(position, idx)); if ((idx & 3) == 0) { - dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv); + dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv)); } else { - dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3); + dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3)); } dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx); } @@ -386,7 +386,7 @@ void MotionFieldProjectionInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace 
libgav1 { namespace dsp { diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc index e9cdd4c..7f5f035 100644 --- a/src/dsp/x86/motion_vector_search_sse4.cc +++ b/src/dsp/x86/motion_vector_search_sse4.cc @@ -251,7 +251,7 @@ void MotionVectorSearchInit_SSE4_1() { } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 3a1d1fd..c34a7f7 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -31,6 +31,7 @@ namespace libgav1 { namespace dsp { +namespace low_bitdepth { namespace { #include "src/dsp/obmc.inc" @@ -311,13 +312,295 @@ void Init8bpp() { } } // namespace +} // namespace low_bitdepth -void ObmcInit_SSE4_1() { Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kRoundBitsObmcBlend = 6; + +inline void OverlapBlendFromLeft2xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00); + // 64 - mask. 
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = Load4x2(pred, pred + pred_stride); + const __m128i obmc_pred_val = + Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i result = RightShiftWithRounding_U32( + _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result, result); + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +inline void OverlapBlendFromLeft4xH_SSE4_1( + uint16_t* const prediction, const ptrdiff_t pred_stride, const int height, + const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = Load4(kObmcMask + 2); + // 64 - mask. 
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = + _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val)); + int y = height; + do { + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y -= 2; + } while (y != 0); +} + +void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi8(64); + const uint8_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = static_cast<uint16_t*>(prediction) + x; + obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x; + const __m128i mask_val = LoadLo8(mask + x); + // 
64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int y = height; + do { + const __m128i pred_val = LoadUnaligned16(pred); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi)); + + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (--y != 0); + x += 8; + } while (x < width); +} + +inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = + height - (height >> 2); // compute_height based on 8-bit opt + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. 
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + Store4(pred, packed_result); + Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8)); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction, + const ptrdiff_t pred_stride, + const int height, + const uint16_t* const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const __m128i mask_inverter = _mm_set1_epi16(64); + const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0); + const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = height - (height >> 2); + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + int y = 0; + do { + // First mask in the first half, second mask in the second half. 
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler); + const __m128i masks = + _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); + const __m128i obmc_pred_val = + LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi); + + StoreLo8(pred, packed_result); + StoreHi8(pred + pred_stride, packed_result); + pred += pred_stride2; + obmc_pred += obmc_pred_stride2; + y += 2; + } while (y < compute_height); +} + +void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction, + const ptrdiff_t prediction_stride, + const int width, const int height, + const void* const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]); + const ptrdiff_t obmc_pred_stride = + obmc_prediction_stride / sizeof(obmc_pred[0]); + + if (width == 2) { + OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + if (width == 4) { + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, + obmc_pred_stride); + return; + } + + const __m128i mask_inverter = _mm_set1_epi8(64); + const int compute_height = height - (height >> 2); + const uint8_t* mask = kObmcMask + height - 
2; + pred = static_cast<uint16_t*>(prediction); + obmc_pred = static_cast<const uint16_t*>(obmc_prediction); + int y = 0; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + const __m128i masks_lo = _mm_cvtepi8_epi16(masks); + const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); + const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); + const __m128i result_lo = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend); + const __m128i result_hi = RightShiftWithRounding_U32( + _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend); + StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi)); + x += 8; + } while (x < width); + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } while (++y < compute_height); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void ObmcInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/obmc_sse4.h 
b/src/dsp/x86/obmc_sse4.h index bd8b416..448d2cf 100644 --- a/src/dsp/x86/obmc_sse4.h +++ b/src/dsp/x86/obmc_sse4.h @@ -38,6 +38,12 @@ void ObmcInit_SSE4_1(); #ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal #define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_ObmcVertical +#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1 +#endif +#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal +#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_ diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc index b2bdfd2..85d05bc 100644 --- a/src/dsp/x86/super_res_sse4.cc +++ b/src/dsp/x86/super_res_sse4.cc @@ -91,10 +91,10 @@ void SuperResCoefficients_SSE4_1(const int upscaled_width, } void SuperRes_SSE4_1(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -104,16 +104,30 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source, ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, kSuperResHorizontalBorder, kSuperResHorizontalBorder); int subpixel_x = initial_subpixel_x; - // The below code calculates up to 15 extra upscaled - // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // The below code calculates up to 15 extra upscaled pixels which will + // over-read up to 15 downscaled pixels in the end of each row. + // kSuperResHorizontalPadding protects this behavior from segmentation + // faults and threading issues. 
int x = RightShiftWithCeiling(upscaled_width, 4); do { __m128i weighted_src[8]; for (int i = 0; i < 8; ++i, filter += 16) { - __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]); + // TODO(b/178652672): Remove Msan loads when hadd bug is resolved. + // It's fine to write uninitialized bytes outside the frame, but the + // inside-frame pixels are incorrectly labeled uninitialized if + // uninitialized values go through the hadd intrinsics. + // |src| is offset 4 pixels to the left, and there are 4 extended border + // pixels, so a difference of 0 from |downscaled_width| indicates 8 good + // bytes. A difference of 1 indicates 7 good bytes. + const int msan_bytes_lo = + (subpixel_x >> kSuperResScaleBits) - downscaled_width; + __m128i s = + LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo); subpixel_x += step; - s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]); + const int msan_bytes_hi = + (subpixel_x >> kSuperResScaleBits) - downscaled_width; + s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits], + msan_bytes_hi); subpixel_x += step; const __m128i f = LoadAligned16(filter); weighted_src[i] = _mm_maddubs_epi16(s, f); @@ -135,26 +149,165 @@ void SuperRes_SSE4_1(const void* const coefficients, void* const source, StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1])); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } void Init8bpp() { Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8); +#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients) dsp->super_res_coefficients = SuperResCoefficients_SSE4_1; +#endif // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients) +#if DSP_ENABLED_8BPP_SSE4_1(SuperRes) dsp->super_res = SuperRes_SSE4_1; +#endif // DSP_ENABLED_8BPP_SSE4_1(SuperRes) } } // namespace } // namespace low_bitdepth -void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); } 
+//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Upscale_Filter as defined in AV1 Section 7.16 +alignas(16) const int16_t + kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, -1, 128, 2, -1, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, {0, 1, -4, 127, 6, -3, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, {0, 2, -7, 125, 11, -4, 1, 0}, + {-1, 2, -8, 125, 13, -5, 2, 0}, {-1, 3, -9, 124, 15, -6, 2, 0}, + {-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1}, + {-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1}, + {-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1}, + {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1}, + {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1}, + {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1}, + {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1}, + {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1}, + {-1, 6, -20, 97, 58, -17, 6, -1}, {-1, 6, -20, 95, 61, -18, 6, -1}, + {-2, 7, -20, 93, 64, -18, 6, -2}, {-2, 7, -20, 91, 66, -19, 6, -1}, + {-2, 7, -20, 88, 69, -19, 6, -1}, {-2, 7, -20, 86, 71, -19, 6, -1}, + {-2, 7, -20, 84, 74, -20, 7, -2}, {-2, 7, -20, 81, 76, -20, 7, -1}, + {-2, 7, -20, 79, 79, -20, 7, -2}, {-1, 7, -20, 76, 81, -20, 7, -2}, + {-2, 7, -20, 74, 84, -20, 7, -2}, {-1, 6, -19, 71, 86, -20, 7, -2}, + {-1, 6, -19, 69, 88, -20, 7, -2}, {-1, 6, -19, 66, 91, -20, 7, -2}, + {-2, 6, -18, 64, 93, -20, 7, -2}, {-1, 6, -18, 61, 95, -20, 6, -1}, + {-1, 6, -17, 58, 97, -20, 6, -1}, {-1, 6, -17, 56, 99, -20, 6, -1}, + {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1}, + {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1}, + {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1}, + {-1, 4, -12, 38, 112, -17, 5, 
-1}, {-1, 4, -12, 35, 114, -16, 5, -1}, + {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1}, + {-1, 3, -9, 28, 118, -14, 4, -1}, {-1, 3, -9, 25, 120, -13, 4, -1}, + {-1, 3, -8, 22, 121, -12, 4, -1}, {-1, 3, -7, 20, 122, -11, 3, -1}, + {-1, 2, -6, 18, 123, -10, 3, -1}, {0, 2, -6, 15, 124, -9, 3, -1}, + {0, 2, -5, 13, 125, -8, 2, -1}, {0, 1, -4, 11, 125, -7, 2, 0}, + {0, 1, -3, 8, 126, -6, 2, 0}, {0, 1, -3, 6, 127, -4, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, {0, 0, -1, 2, 128, -1, 0, 0}, +}; + +void SuperResCoefficients_SSE4_1(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast<uint16_t*>(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + for (int i = 0; i < 8; ++i, dst += 8) { + int remainder = subpixel_x & kSuperResScaleMask; + __m128i filter = + LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]); + subpixel_x += step; + StoreAligned16(dst, filter); + } + } while (--x != 0); +} + +template <int bitdepth> +void SuperRes_SSE4_1(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast<uint16_t*>(dest); + int y = height; + do { + const auto* filter = static_cast<const uint16_t*>(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalPadding); + int subpixel_x = initial_subpixel_x; + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalPadding accounts for this. 
+ int x = RightShiftWithCeiling(upscaled_width, 3); + do { + __m128i weighted_src[8]; + for (int i = 0; i < 8; ++i, filter += 8) { + const __m128i s = + LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]); + subpixel_x += step; + const __m128i f = LoadAligned16(filter); + weighted_src[i] = _mm_madd_epi16(s, f); + } + + __m128i a[4]; + a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]); + a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]); + a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]); + a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]); + + a[0] = _mm_hadd_epi32(a[0], a[1]); + a[1] = _mm_hadd_epi32(a[2], a[3]); + a[0] = RightShiftWithRounding_S32(a[0], kFilterBits); + a[1] = RightShiftWithRounding_S32(a[1], kFilterBits); + + // Clip the values at (1 << bd) - 1 + const __m128i clipped_16 = _mm_min_epi16( + _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1)); + StoreAligned16(dst_ptr, clipped_16); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients) + dsp->super_res_coefficients = SuperResCoefficients_SSE4_1; +#else + static_cast<void>(SuperResCoefficients_SSE4_1); +#endif +#if DSP_ENABLED_10BPP_SSE4_1(SuperRes) + dsp->super_res = SuperRes_SSE4_1<10>; +#else + static_cast<void>(SuperRes_SSE4_1); +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h index aef5147..07a7ef4 100644 --- 
a/src/dsp/x86/super_res_sse4.h +++ b/src/dsp/x86/super_res_sse4.h @@ -30,9 +30,21 @@ void SuperResInit_SSE4_1(); } // namespace libgav1 #if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + #ifndef LIBGAV1_Dsp8bpp_SuperRes #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1 #endif + +#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_SuperRes +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h index 208b301..9726495 100644 --- a/src/dsp/x86/transpose_sse4.h +++ b/src/dsp/x86/transpose_sse4.h @@ -30,9 +30,9 @@ LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in, __m128i* const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 10 11 20 21 30 31 - // in[0]: 40 41 50 51 60 61 70 71 - // in[0]: 80 81 90 91 a0 a1 b0 b1 - // in[0]: c0 c1 d0 d1 e0 e1 f0 f1 + // in[1]: 40 41 50 51 60 61 70 71 + // in[2]: 80 81 90 91 a0 a1 b0 b1 + // in[3]: c0 c1 d0 d1 e0 e1 f0 f1 // to: // a0: 00 40 01 41 10 50 11 51 // a1: 20 60 21 61 30 70 31 71 diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc index 43279ab..9ddfeac 100644 --- a/src/dsp/x86/warp_sse4.cc +++ b/src/dsp/x86/warp_sse4.cc @@ -513,7 +513,7 @@ void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); } } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc index dfd5662..08a1739 100644 --- a/src/dsp/x86/weight_mask_sse4.cc +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -36,47 +36,65 @@ namespace { constexpr int kRoundingBits8bpp = 4; -template <bool mask_is_inverse> -inline 
void WeightMask8_SSE4(const int16_t* prediction_0, - const int16_t* prediction_1, uint8_t* mask) { - const __m128i pred_0 = LoadAligned16(prediction_0); - const __m128i pred_1 = LoadAligned16(prediction_1); - const __m128i difference = RightShiftWithRounding_U16( - _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp); - const __m128i scaled_difference = _mm_srli_epi16(difference, 4); +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_SSE4(const int16_t* prediction_0, + const int16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i difference_0 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp); + const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i difference_1 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp); + const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4); + const __m128i difference_offset = _mm_set1_epi8(38); const __m128i adjusted_difference = - _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference), + _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1), difference_offset); const __m128i mask_ceiling = _mm_set1_epi8(64); const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling); if (mask_is_inverse) { const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); - StoreLo8(mask, inverted_mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } } else { - StoreLo8(mask, mask_value); + if (is_store_16) { + StoreAligned16(mask, 
mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } } } -#define WEIGHT8_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask) +#define WEIGHT8_PAIR_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) -#define WEIGHT8_AND_STRIDE \ - WEIGHT8_WITHOUT_STRIDE; \ - pred_0 += 8; \ - pred_1 += 8; \ - mask += mask_stride +#define WEIGHT8_PAIR_AND_STRIDE \ + WEIGHT8_PAIR_WITHOUT_STRIDE; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 template <bool mask_is_inverse> void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; - do { - WEIGHT8_AND_STRIDE; - } while (++y < 7); - WEIGHT8_WITHOUT_STRIDE; + + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -84,13 +102,13 @@ void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 3; do { - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y3 < 5); - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; } template <bool mask_is_inverse> @@ -98,21 +116,17 @@ void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 5; do { - WEIGHT8_AND_STRIDE; - 
WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - WEIGHT8_AND_STRIDE; - } while (++y5 < 6); - WEIGHT8_AND_STRIDE; - WEIGHT8_WITHOUT_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE; } -#define WEIGHT16_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8) +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) #define WEIGHT16_AND_STRIDE \ WEIGHT16_WITHOUT_STRIDE; \ @@ -125,10 +139,10 @@ void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y = 0; + int y = 7; do { WEIGHT16_AND_STRIDE; - } while (++y < 7); + } while (--y != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -137,12 +151,12 @@ void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } @@ -151,14 +165,14 @@ void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT16_AND_STRIDE; WEIGHT16_WITHOUT_STRIDE; } @@ -168,20 +182,19 @@ void 
WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; WEIGHT16_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT16_WITHOUT_STRIDE; } -#define WEIGHT32_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24) +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE \ WEIGHT32_WITHOUT_STRIDE; \ @@ -209,12 +222,12 @@ void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 5; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 5); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } @@ -223,14 +236,14 @@ void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y5 = 0; + int y5 = 6; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y5 < 6); + } while (--y5 != 0); WEIGHT32_AND_STRIDE; WEIGHT32_WITHOUT_STRIDE; } @@ -240,24 +253,23 @@ void WeightMask32x64_SSE4(const void* 
prediction_0, const void* prediction_1, uint8_t* mask, ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int y3 = 0; + int y3 = 21; do { WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; WEIGHT32_AND_STRIDE; - } while (++y3 < 21); + } while (--y3 != 0); WEIGHT32_WITHOUT_STRIDE; } -#define WEIGHT64_WITHOUT_STRIDE \ - WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \ - WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56) +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE \ WEIGHT64_WITHOUT_STRIDE; \ @@ -447,12 +459,491 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); } +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kRoundingBits10bpp = 6; +constexpr int kScaledDiffShift = 4; + +template <bool mask_is_inverse, bool is_store_16> +inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0, + const uint16_t* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const __m128i diff_offset = 
_mm_set1_epi8(38); + const __m128i mask_ceiling = _mm_set1_epi8(64); + const __m128i zero = _mm_setzero_si128(); + + // Range of prediction: [3988, 61532]. + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00); + const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10); + const __m128i diff_lo_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp); + + const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero); + const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero); + const __m128i diff_hi_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp); + + const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0); + const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01); + const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11); + const __m128i diff_lo_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp); + + const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero); + const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero); + const __m128i diff_hi_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp); + + const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1); + const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift); + + const __m128i adjusted_diff = _mm_adds_epu8( + _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset); + const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling); + + if (mask_is_inverse) { + const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); + if 
(is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } + } else { + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } + } +} + +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT8_PAIR_AND_STRIDE_10BPP \ + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 + +template <bool mask_is_inverse> +void WeightMask8x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 3; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask8x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 5; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y5 != 0); + 
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT16_AND_STRIDE_10BPP \ + WEIGHT16_WITHOUT_STRIDE_10BPP; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask16x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y = 7; + do { + WEIGHT16_AND_STRIDE_10BPP; + } while (--y != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask16x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const 
uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) + +#define WEIGHT32_AND_STRIDE_10BPP \ + WEIGHT32_WITHOUT_STRIDE_10BPP; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask32x8_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; 
+ WEIGHT32_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask32x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) + +#define WEIGHT64_AND_STRIDE_10BPP \ + WEIGHT64_WITHOUT_STRIDE_10BPP; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template <bool mask_is_inverse> +void WeightMask64x16_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 5; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x32_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y5 = 6; + do { + WEIGHT64_AND_STRIDE_10BPP; + 
WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask64x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x64_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 21; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + 
WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template <bool mask_is_inverse> +void WeightMask128x128_10bpp_SSE4(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); + const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); + int y3 = 42; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_10bpp_SSE4<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_10bpp_SSE4<1> +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0); + 
INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void WeightMaskInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_TARGETING_SSE4_1 +#else // !LIBGAV1_TARGETING_SSE4_1 namespace libgav1 { namespace dsp { diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h index 07636b7..e5d9d70 100644 --- a/src/dsp/x86/weight_mask_sse4.h +++ b/src/dsp/x86/weight_mask_sse4.h @@ -99,6 +99,73 @@ void WeightMaskInit_SSE4_1(); #define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 #endif +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8 +#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16 +#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32 +#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8 +#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16 +#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32 +#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64 +#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_WeightMask_32x8 +#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16 +#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32 +#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64 +#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16 +#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32 +#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64 +#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128 +#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64 +#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128 +#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 +#endif #endif // LIBGAV1_TARGETING_SSE4_1 #endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_ diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h index ab22a4d..7ee487f 100644 --- a/src/gav1/decoder_settings.h +++ b/src/gav1/decoder_settings.h @@ -62,7 +62,8 @@ typedef struct Libgav1DecoderSettings { Libgav1GetFrameBufferCallback get_frame_buffer; // Release frame buffer callback. Libgav1ReleaseFrameBufferCallback release_frame_buffer; - // Release input frame buffer callback. + // Release input frame buffer callback. This callback must be set when + // |frame_parallel| is true. Libgav1ReleaseInputBufferCallback release_input_buffer; // Passed as the private_data argument to the callbacks. 
void* callback_private_data; @@ -117,7 +118,8 @@ struct DecoderSettings { GetFrameBufferCallback get_frame_buffer = nullptr; // Release frame buffer callback. ReleaseFrameBufferCallback release_frame_buffer = nullptr; - // Release input frame buffer callback. + // Release input frame buffer callback. This callback must be set when + // |frame_parallel| is true. ReleaseInputBufferCallback release_input_buffer = nullptr; // Passed as the private_data argument to the callbacks. void* callback_private_data = nullptr; diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h index ad7498c..116a514 100644 --- a/src/gav1/symbol_visibility.h +++ b/src/gav1/symbol_visibility.h @@ -58,6 +58,11 @@ // // Much of the above information and more can be found at // https://gcc.gnu.org/wiki/Visibility +// +// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the +// compiler command line to override the definition of LIBGAV1_PUBLIC in this +// header. This can be used to create a libgav1 static library that will not +// export any symbols when it is linked into a shared library. 
#if !defined(LIBGAV1_PUBLIC) #if defined(_WIN32) @@ -76,7 +81,7 @@ #else #define LIBGAV1_PUBLIC #endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL -#else +#else // !defined(_WIN32) #if defined(__GNUC__) && __GNUC__ >= 4 #define LIBGAV1_PUBLIC __attribute__((visibility("default"))) #else diff --git a/src/gav1/version.h b/src/gav1/version.h index 78a573e..c018928 100644 --- a/src/gav1/version.h +++ b/src/gav1/version.h @@ -24,7 +24,7 @@ #define LIBGAV1_MAJOR_VERSION 0 #define LIBGAV1_MINOR_VERSION 16 -#define LIBGAV1_PATCH_VERSION 1 +#define LIBGAV1_PATCH_VERSION 3 #define LIBGAV1_VERSION \ ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \ diff --git a/src/obu_parser.cc b/src/obu_parser.cc index bbf00ed..69480d7 100644 --- a/src/obu_parser.cc +++ b/src/obu_parser.cc @@ -479,9 +479,13 @@ bool ObuParser::ParseSequenceHeader(bool seen_frame_header) { LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame."); return false; } + sequence_header_changed_ = true; decoder_state_.ClearReferenceFrames(); } sequence_header_ = sequence_header; + if (!has_sequence_header_) { + sequence_header_changed_ = true; + } has_sequence_header_ = true; // Section 6.4.1: It is a requirement of bitstream conformance that if // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for @@ -509,12 +513,12 @@ void ObuParser::MarkInvalidReferenceFrames() { if (lower_bound_is_smaller) { if (reference_frame_id > decoder_state_.current_frame_id || reference_frame_id < lower_bound) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } else { if (reference_frame_id > decoder_state_.current_frame_id && reference_frame_id < lower_bound) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } } @@ -621,7 +625,7 @@ bool ObuParser::ParseReferenceOrderHint() { frame_header_.reference_order_hint[i] = scratch; if (frame_header_.reference_order_hint[i] != 
decoder_state_.reference_order_hint[i]) { - decoder_state_.reference_valid[i] = false; + decoder_state_.reference_frame[i] = nullptr; } } return true; @@ -1787,10 +1791,11 @@ bool ObuParser::ParseFrameParameters() { // whenever display_frame_id is read, the value matches // RefFrameId[ frame_to_show_map_idx ] ..., and that // RefValid[ frame_to_show_map_idx ] is equal to 1. + // + // The current_frame_ == nullptr check below is equivalent to checking + // if RefValid[ frame_to_show_map_idx ] is equal to 1. if (frame_header_.display_frame_id != - decoder_state_ - .reference_frame_id[frame_header_.frame_to_show] || - !decoder_state_.reference_valid[frame_header_.frame_to_show]) { + decoder_state_.reference_frame_id[frame_header_.frame_to_show]) { LIBGAV1_DLOG(ERROR, "Reference buffer %d has a frame id number mismatch.", frame_header_.frame_to_show); @@ -1868,7 +1873,6 @@ bool ObuParser::ParseFrameParameters() { } } if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) { - decoder_state_.reference_valid.fill(false); decoder_state_.reference_order_hint.fill(0); decoder_state_.reference_frame.fill(nullptr); } @@ -2019,15 +2023,8 @@ bool ObuParser::ParseFrameParameters() { // Note if support for Annex C: Error resilience behavior is added this // check should be omitted per C.5 Decoder consequences of processable // frames. - if (!decoder_state_.reference_valid[reference_frame_index]) { - LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i, - reference_frame_index); - return false; - } - // Check if the inter frame requests a nonexistent reference, whether or - // not frame_refs_short_signaling is used. 
if (decoder_state_.reference_frame[reference_frame_index] == nullptr) { - LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i, + LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i, reference_frame_index); return false; } @@ -2043,12 +2040,8 @@ bool ObuParser::ParseFrameParameters() { // Section 6.8.2: It is a requirement of bitstream conformance that // whenever expectedFrameId[ i ] is calculated, the value matches // RefFrameId[ ref_frame_idx[ i ] ] ... - // - // Section 6.8.2: It is a requirement of bitstream conformance that - // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ... if (frame_header_.expected_frame_id[i] != - decoder_state_.reference_frame_id[reference_frame_index] || - !decoder_state_.reference_valid[reference_frame_index]) { + decoder_state_.reference_frame_id[reference_frame_index]) { LIBGAV1_DLOG(ERROR, "Reference buffer %d has a frame id number mismatch.", reference_frame_index); @@ -2665,6 +2658,7 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { metadata_ = {}; tile_buffers_.clear(); next_tile_group_start_ = 0; + sequence_header_changed_ = false; bool parsed_one_full_frame = false; bool seen_frame_header = false; diff --git a/src/obu_parser.h b/src/obu_parser.h index 86d165f..c4619ed 100644 --- a/src/obu_parser.h +++ b/src/obu_parser.h @@ -276,6 +276,9 @@ class ObuParser : public Allocable { const ObuFrameHeader& frame_header() const { return frame_header_; } const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; } const ObuMetadata& metadata() const { return metadata_; } + // Returns true if the last call to ParseOneFrame() encountered a sequence + // header change. + bool sequence_header_changed() const { return sequence_header_changed_; } // Setters. void set_sequence_header(const ObuSequenceHeader& sequence_header) { @@ -384,6 +387,9 @@ class ObuParser : public Allocable { int next_tile_group_start_ = 0; // If true, the sequence_header_ field is valid. 
bool has_sequence_header_ = false; + // If true, it means that the last call to ParseOneFrame() encountered a + // sequence header change. + bool sequence_header_changed_ = false; // If true, the obu_extension_flag syntax element in the OBU header must be // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0. bool extension_disallowed_ = false; diff --git a/src/post_filter.h b/src/post_filter.h index 800d51d..dfcd08e 100644 --- a/src/post_filter.h +++ b/src/post_filter.h @@ -272,8 +272,6 @@ class PostFilter { void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4, bool for_loop_restoration); // Sets up the |loop_restoration_border_| for loop restoration. - // TODO(linfengz): Unify duplicates in the following two functions if - // possible. // This is called when there is no CDEF filter. We copy rows from // |superres_buffer_| and do the line extension. void SetupLoopRestorationBorder(int row4x4_start); @@ -401,11 +399,14 @@ class PostFilter { // Applies super resolution for the |src| for |rows[plane]| rows of each // plane. If |line_buffer_row| is larger than or equal to 0, one more row will // be processed, the line buffer indicated by |line_buffer_row| will be used - // as the source. + // as the source. If |dst_is_loop_restoration_border| is true, then it means + // that the |dst| pointers come from |loop_restoration_border_| and the + // strides will be populated from that buffer. void ApplySuperRes( const std::array<uint8_t*, kMaxPlanes>& src, const std::array<int, kMaxPlanes>& rows, int line_buffer_row, - const std::array<uint8_t*, kMaxPlanes>& dst); // Section 7.16. + const std::array<uint8_t*, kMaxPlanes>& dst, + bool dst_is_loop_restoration_border = false); // Section 7.16. // Applies SuperRes for the superblock row starting at |row4x4| with a height // of 4*|sb4x4|. 
void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4, diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc index 994f448..f32b0a0 100644 --- a/src/post_filter/cdef.cc +++ b/src/post_filter/cdef.cc @@ -272,7 +272,7 @@ void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index, const uint16_t* cdef_src_row_base[kMaxPlanes]; int cdef_src_row_base_stride[kMaxPlanes]; int column_step[kMaxPlanes]; - assert(planes_ >= 1); + assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes); int plane = kPlaneY; do { cdef_buffer_row_base[plane] = diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc index 3d5da90..826ef48 100644 --- a/src/post_filter/loop_restoration.cc +++ b/src/post_filter/loop_restoration.cc @@ -29,15 +29,15 @@ void PostFilter::ApplyLoopRestorationForOneRow( unit_row * num_horizontal_units); const bool in_place = DoCdef() || thread_pool_ != nullptr; const Pixel* border = nullptr; + ptrdiff_t border_stride = 0; src_buffer += unit_y * stride; if (in_place) { - assert(loop_restoration_border_.stride(plane) == - static_cast<int>(sizeof(Pixel) * stride)); const int border_unit_y = std::max( RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0); + border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel); border = reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) + - border_unit_y * stride; + border_unit_y * border_stride; } int unit_column = 0; int column = 0; @@ -61,18 +61,22 @@ void PostFilter::ApplyLoopRestorationForOneRow( } } else { const Pixel* top_border = src - kRestorationVerticalBorder * stride; + ptrdiff_t top_border_stride = stride; const Pixel* bottom_border = src + current_process_unit_height * stride; + ptrdiff_t bottom_border_stride = stride; const bool frame_bottom_border = (unit_y + current_process_unit_height >= plane_height); if (in_place && (unit_y != 0 || !frame_bottom_border)) { const Pixel* loop_restoration_border = border + 
column; if (unit_y != 0) { top_border = loop_restoration_border; - loop_restoration_border += 4 * stride; + top_border_stride = border_stride; + loop_restoration_border += 4 * border_stride; } if (!frame_bottom_border) { - bottom_border = - loop_restoration_border + kRestorationVerticalBorder * stride; + bottom_border = loop_restoration_border + + kRestorationVerticalBorder * border_stride; + bottom_border_stride = border_stride; } } RestorationBuffer restoration_buffer; @@ -81,10 +85,10 @@ void PostFilter::ApplyLoopRestorationForOneRow( type == kLoopRestorationTypeWiener); const dsp::LoopRestorationFunc restoration_func = dsp_.loop_restorations[type - 2]; - restoration_func(restoration_info[unit_column], src, top_border, - bottom_border, stride, current_process_unit_width, - current_process_unit_height, &restoration_buffer, - dst_buffer + column); + restoration_func(restoration_info[unit_column], src, stride, top_border, + top_border_stride, bottom_border, bottom_border_stride, + current_process_unit_width, current_process_unit_height, + &restoration_buffer, dst_buffer + column); } ++unit_column; column += plane_unit_size; diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc index 0eacf34..7671f01 100644 --- a/src/post_filter/post_filter.cc +++ b/src/post_filter/post_filter.cc @@ -306,11 +306,11 @@ void PostFilter::ExtendBordersForReferenceFrame() { } void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) { - assert(frame_buffer_.stride(plane) == loop_restoration_border_.stride(plane)); - const ptrdiff_t stride = frame_buffer_.stride(plane); + const ptrdiff_t src_stride = frame_buffer_.stride(plane); const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0); const int row_offset = DivideBy4(row4x4); - uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride; + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride; const 
int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]); const int row_width = num_pixels << pixel_size_log2_; @@ -326,9 +326,9 @@ void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) { // border extension). row = last_valid_row; } - memcpy(dst, src + row * stride, row_width); + memcpy(dst, src + row * src_stride, row_width); last_valid_row = row; - dst += stride; + dst += dst_stride; } } @@ -395,9 +395,6 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { continue; } - assert(frame_buffer_.stride(plane) == - loop_restoration_border_.stride(plane)); - const ptrdiff_t stride = frame_buffer_.stride(plane); const int row_offset = DivideBy4(row4x4); const int num_pixels = SubsampledValue(upscaled_width_, subsampling_x_[plane]); @@ -406,9 +403,13 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { const int row = kLoopRestorationBorderRows[subsampling_y_[plane]]; const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row; + const ptrdiff_t src_stride = frame_buffer_.stride(plane); const uint8_t* src = - GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride; - uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride; + GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + + row * src_stride; + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = + loop_restoration_border_.data(plane) + row_offset * dst_stride; for (int i = 0; i < 4; ++i) { memcpy(dst, src, row_width); #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -421,8 +422,8 @@ void PostFilter::SetupLoopRestorationBorder(const int row4x4) { kRestorationHorizontalBorder); // If we run out of rows, copy the last valid row (mimics the bottom // border extension). 
- if (absolute_row + i < plane_height - 1) src += stride; - dst += stride; + if (absolute_row + i < plane_height - 1) src += src_stride; + dst += dst_stride; } } while (++plane < planes_); } @@ -434,7 +435,7 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) { for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) { const int row4x4 = row4x4_start + sb_y; const int row_offset_start = DivideBy4(row4x4); - std::array<uint8_t*, kMaxPlanes> dst = { + const std::array<uint8_t*, kMaxPlanes> dst = { loop_restoration_border_.data(kPlaneY) + row_offset_start * loop_restoration_border_.stride(kPlaneY), loop_restoration_border_.data(kPlaneU) + @@ -462,13 +463,14 @@ void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) { row * frame_buffer_.stride(plane); rows[plane] = Clip3(plane_height - absolute_row, 0, 4); } while (++plane < planes_); - ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst); + ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst, + /*dst_is_loop_restoration_border=*/true); // If we run out of rows, copy the last valid row (mimics the bottom // border extension). 
plane = kPlaneY; do { if (rows[plane] == 0 || rows[plane] >= 4) continue; - const ptrdiff_t stride = frame_buffer_.stride(plane); + const ptrdiff_t stride = loop_restoration_border_.stride(plane); uint8_t* dst_line = dst[plane] + rows[plane] * stride; const uint8_t* const src_line = dst_line - stride; const int upscaled_width = super_res_info_[plane].upscaled_width diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc index a70e4ed..554e537 100644 --- a/src/post_filter/super_res.cc +++ b/src/post_filter/super_res.cc @@ -19,7 +19,8 @@ namespace libgav1 { void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, const std::array<int, kMaxPlanes>& rows, const int line_buffer_row, - const std::array<uint8_t*, kMaxPlanes>& dst) { + const std::array<uint8_t*, kMaxPlanes>& dst, + bool dst_is_loop_restoration_border /*=false*/) { int plane = kPlaneY; do { const int plane_width = @@ -28,13 +29,19 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, if (bitdepth_ >= 10) { auto* input = reinterpret_cast<uint16_t*>(src[plane]); auto* output = reinterpret_cast<uint16_t*>(dst[plane]); - const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t); + const ptrdiff_t input_stride = + frame_buffer_.stride(plane) / sizeof(uint16_t); + const ptrdiff_t output_stride = + (dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane)) / + sizeof(uint16_t); if (rows[plane] > 0) { dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - input, stride, rows[plane], plane_width, + input, input_stride, rows[plane], plane_width, super_res_info_[plane].upscaled_width, super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output); + super_res_info_[plane].step, output, output_stride); } // In the multi-threaded case, the |superres_line_buffer_| holds the last // input row. Apply SuperRes for that row. 
@@ -44,24 +51,29 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, line_buffer_row * superres_line_buffer_.stride(plane) / sizeof(uint16_t) + kSuperResHorizontalBorder; - dsp_.super_res( - superres_coefficients_[static_cast<int>(plane != 0)], - line_buffer_start, /*stride=*/0, - /*height=*/1, plane_width, super_res_info_[plane].upscaled_width, - super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output + rows[plane] * stride); + dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, + super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, + output + rows[plane] * output_stride, /*dest_stride=*/0); } continue; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 uint8_t* input = src[plane]; uint8_t* output = dst[plane]; + const ptrdiff_t input_stride = frame_buffer_.stride(plane); + const ptrdiff_t output_stride = dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane); if (rows[plane] > 0) { dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - input, frame_buffer_.stride(plane), rows[plane], - plane_width, super_res_info_[plane].upscaled_width, + input, input_stride, rows[plane], plane_width, + super_res_info_[plane].upscaled_width, super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, output); + super_res_info_[plane].step, output, output_stride); } // In the multi-threaded case, the |superres_line_buffer_| holds the last // input row. Apply SuperRes for that row. 
@@ -70,13 +82,13 @@ void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src, superres_line_buffer_.data(plane) + line_buffer_row * superres_line_buffer_.stride(plane) + kSuperResHorizontalBorder; - dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)], - line_buffer_start, /*stride=*/0, - /*height=*/1, plane_width, - super_res_info_[plane].upscaled_width, - super_res_info_[plane].initial_subpixel_x, - super_res_info_[plane].step, - output + rows[plane] * frame_buffer_.stride(plane)); + dsp_.super_res( + superres_coefficients_[static_cast<int>(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, output + rows[plane] * output_stride, + /*dest_stride=*/0); } } while (++plane < planes_); } diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc index e166392..44a842c 100644 --- a/src/residual_buffer_pool.cc +++ b/src/residual_buffer_pool.cc @@ -129,7 +129,8 @@ std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() { } void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) { - buffer->transform_parameters()->Reset(); + buffer->transform_parameters()->Clear(); + buffer->partition_tree_order()->Clear(); std::lock_guard<std::mutex> lock(mutex_); buffers_.Push(std::move(buffer)); } diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h index f7bc75d..75924db 100644 --- a/src/residual_buffer_pool.h +++ b/src/residual_buffer_pool.h @@ -27,73 +27,11 @@ #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/queue.h" #include "src/utils/types.h" namespace libgav1 { -// A simple fixed size queue implementation to hold the transform parameters -// when |Tile::split_parse_and_decode_| is true. 
We don't have to do any -// boundary checks since we always push data into the queue before accessing it. -class TransformParameterQueue { - public: - TransformParameterQueue() = default; - - // Move only. - TransformParameterQueue(TransformParameterQueue&& other) = default; - TransformParameterQueue& operator=(TransformParameterQueue&& other) = default; - - LIBGAV1_MUST_USE_RESULT bool Init(int max_size) { - max_size_ = max_size; - // No initialization is necessary since the data will be always written to - // before being read. - non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]); - tx_type_.reset(new (std::nothrow) TransformType[max_size_]); - return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr; - } - - // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue. - void Push(int non_zero_coeff_count, TransformType tx_type) { - assert(back_ < max_size_); - non_zero_coeff_count_[back_] = non_zero_coeff_count; - tx_type_[back_++] = tx_type; - } - - // Returns the non_zero_coeff_count at the front of the queue. - int16_t NonZeroCoeffCount() const { - assert(front_ != back_); - return non_zero_coeff_count_[front_]; - } - - // Returns the tx_type at the front of the queue. - TransformType Type() const { - assert(front_ != back_); - return tx_type_[front_]; - } - - // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the - // queue. - void Pop() { - assert(front_ != back_); - ++front_; - } - - // Clears the queue. - void Reset() { - front_ = 0; - back_ = 0; - } - - // Used only in the tests. Returns the number of elements in the queue. - int Size() const { return back_ - front_; } - - private: - int max_size_ = 0; - std::unique_ptr<int16_t[]> non_zero_coeff_count_; - std::unique_ptr<TransformType[]> tx_type_; - int front_ = 0; - int back_ = 0; -}; - // This class is used for parsing and decoding a superblock. Members of this // class are populated in the "parse" step and consumed in the "decode" step. 
class ResidualBuffer : public Allocable { @@ -104,7 +42,8 @@ class ResidualBuffer : public Allocable { if (buffer != nullptr) { buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size); if (buffer->buffer_ == nullptr || - !buffer->transform_parameters_.Init(queue_size)) { + !buffer->transform_parameters_.Init(queue_size) || + !buffer->partition_tree_order_.Init(queue_size)) { buffer = nullptr; } } @@ -118,9 +57,14 @@ class ResidualBuffer : public Allocable { // Buffer used to store the residual values. uint8_t* buffer() { return buffer_.get(); } // Queue used to store the transform parameters. - TransformParameterQueue* transform_parameters() { + Queue<TransformParameters>* transform_parameters() { return &transform_parameters_; } + // Queue used to store the block ordering in the partition tree of the + // superblocks. + Queue<PartitionTreeNode>* partition_tree_order() { + return &partition_tree_order_; + } private: friend class ResidualBufferStack; @@ -128,7 +72,8 @@ class ResidualBuffer : public Allocable { ResidualBuffer() = default; AlignedUniquePtr<uint8_t> buffer_; - TransformParameterQueue transform_parameters_; + Queue<TransformParameters> transform_parameters_; + Queue<PartitionTreeNode> partition_tree_order_; // Used by ResidualBufferStack to form a chain of ResidualBuffers. ResidualBuffer* next_ = nullptr; }; diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc index cd4d576..17ce18f 100644 --- a/src/threading_strategy.cc +++ b/src/threading_strategy.cc @@ -36,24 +36,25 @@ constexpr int kFrameParallelThresholdMultiplier = // Computes the number of frame threads to be used based on the following // heuristic: // * If |thread_count| == 1, return 0. -// * If |thread_count| <= |tile_count| * 4, return 0. +// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier, +// return 0. // * Otherwise, return the largest value of i which satisfies the following // condition: i + i * tile_columns <= thread_count. 
This ensures that there // are at least |tile_columns| worker threads for each frame thread. // * This function will never return 1 or a value > |thread_count|. // -// This heuristic is based empirical performance data. The in-frame threading -// model (combination of tile multithreading, superblock row multithreading and -// post filter multithreading) performs better than the frame parallel model -// until we reach the threshold of |thread_count| > |tile_count| * -// kFrameParallelThresholdMultiplier. +// This heuristic is based on empirical performance data. The in-frame +// threading model (combination of tile multithreading, superblock row +// multithreading and post filter multithreading) performs better than the +// frame parallel model until we reach the threshold of |thread_count| > +// |tile_count| * kFrameParallelThresholdMultiplier. // // It is a function of |tile_count| since tile threading and superblock row -// multithreading will scale only as a factor of |tile_count|. The threshold 4 -// is arrived at based on empirical data. The general idea is that superblock -// row multithreading plateaus at 4 * |tile_count| because in most practical -// cases there aren't more than that many superblock rows and columns available -// to work on in parallel. +// multithreading will scale only as a factor of |tile_count|. The threshold +// kFrameParallelThresholdMultiplier is arrived at based on empirical data. +// The general idea is that superblock row multithreading plateaus at 4 * +// |tile_count| because in most practical cases there aren't more than that +// many superblock rows and columns available to work on in parallel. 
int ComputeFrameThreadCount(int thread_count, int tile_count, int tile_columns) { assert(thread_count > 0); @@ -132,7 +133,7 @@ bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, thread_count -= 2; if (thread_count <= 0) break; } -#else // !defined(__ANDROID__) +#else // !defined(__ANDROID__) // Assign the remaining threads to each Tile. for (int i = 0; i < tile_count; ++i) { const int count = thread_count / tile_count + @@ -48,7 +48,6 @@ #include "src/utils/constants.h" #include "src/utils/entropy_decoder.h" #include "src/utils/memory.h" -#include "src/utils/parameter_tree.h" #include "src/utils/segmentation_map.h" #include "src/utils/threadpool.h" #include "src/utils/types.h" @@ -292,26 +291,25 @@ class Tile : public Allocable { // iteratively. It performs a DFS traversal over the partition tree to process // the blocks in the right order. bool ProcessPartition( - int row4x4_start, int column4x4_start, ParameterTree* root, - TileScratchBuffer* scratch_buffer, + int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer, ResidualPtr* residual); // Iterative implementation of 5.11.4. bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size, - ParameterTree* tree, TileScratchBuffer* scratch_buffer, + TileScratchBuffer* scratch_buffer, ResidualPtr* residual); // 5.11.5. void ResetCdef(int row4x4, int column4x4); // 5.11.55. // This function is used to decode a superblock when the parsing has already // been done for that superblock. - bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer, - ResidualPtr* residual); + bool DecodeSuperBlock(int sb_row_index, int sb_column_index, + TileScratchBuffer* scratch_buffer); // Helper function used by DecodeSuperBlock(). Note that the decode_block() // function in the spec is equivalent to ProcessBlock() in the code. 
- bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer, - ResidualPtr* residual); + bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size, + TileScratchBuffer* scratch_buffer, ResidualPtr* residual); void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4, int column4x4); // 5.11.3. - bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, + bool ProcessSuperBlock(int row4x4, int column4x4, TileScratchBuffer* scratch_buffer, ProcessingMode mode); void ResetLoopRestorationParams(); diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc index 674d210..41b42d6 100644 --- a/src/tile/bitstream/palette.cc +++ b/src/tile/bitstream/palette.cc @@ -130,10 +130,10 @@ void Tile::ReadPaletteColors(const Block& block, Plane plane) { void Tile::ReadPaletteModeInfo(const Block& block) { BlockParameters& bp = *block.bp; + bp.palette_mode_info.size[kPlaneTypeY] = 0; + bp.palette_mode_info.size[kPlaneTypeUV] = 0; if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 || !frame_header_.allow_screen_content_tools) { - bp.palette_mode_info.size[kPlaneTypeY] = 0; - bp.palette_mode_info.size[kPlaneTypeUV] = 0; return; } const int block_size_context = @@ -156,7 +156,7 @@ void Tile::ReadPaletteModeInfo(const Block& block) { ReadPaletteColors(block, kPlaneY); } } - if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) { + if (block.HasChroma() && bp.uv_mode == kPredictionModeDc) { const int context = static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0); const bool has_palette_uv = diff --git a/src/tile/tile.cc b/src/tile/tile.cc index ee48f17..9699517 100644 --- a/src/tile/tile.cc +++ b/src/tile/tile.cc @@ -609,7 +609,7 @@ bool Tile::ProcessSuperBlockRow(int row4x4, const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()]; for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_; column4x4 += block_width4x4) { - if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, 
scratch_buffer, + if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer, processing_mode)) { LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d", row4x4, column4x4); @@ -642,9 +642,6 @@ void Tile::SaveSymbolDecoderContext() { } bool Tile::ParseAndDecode() { - // If this is the main thread, we build the loop filter bit masks when parsing - // so that it happens in the current thread. This ensures that the main thread - // does as much work as possible. if (split_parse_and_decode_) { if (!ThreadedParseAndDecode()) return false; SaveSymbolDecoderContext(); @@ -776,8 +773,8 @@ bool Tile::ThreadedParseAndDecode() { for (int column4x4 = column4x4_start_, column_index = 0; column4x4 < column4x4_end_; column4x4 += block_width4x4, ++column_index) { - if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, - scratch_buffer.get(), kProcessingModeParseOnly)) { + if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(), + kProcessingModeParseOnly)) { std::lock_guard<std::mutex> lock(threading_.mutex); threading_.abort = true; break; @@ -862,8 +859,8 @@ void Tile::DecodeSuperBlock(int row_index, int column_index, tile_scratch_buffer_pool_->Get(); bool ok = scratch_buffer != nullptr; if (ok) { - ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4, - scratch_buffer.get(), kProcessingModeDecodeOnly); + ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(), + kProcessingModeDecodeOnly); tile_scratch_buffer_pool_->Release(std::move(scratch_buffer)); } std::unique_lock<std::mutex> lock(threading_.mutex); @@ -1629,11 +1626,12 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, const int sb_row_index = SuperBlockRowIndex(block.row4x4); const int sb_column_index = SuperBlockColumnIndex(block.column4x4); if (mode == kProcessingModeDecodeOnly) { - TransformParameterQueue& tx_params = + Queue<TransformParameters>& tx_params = *residual_buffer_threaded_[sb_row_index][sb_column_index] ->transform_parameters(); 
ReconstructBlock(block, plane, start_x, start_y, tx_size, - tx_params.Type(), tx_params.NonZeroCoeffCount()); + tx_params.Front().type, + tx_params.Front().non_zero_coeff_count); tx_params.Pop(); } else { TransformType tx_type; @@ -1656,7 +1654,7 @@ bool Tile::TransformBlock(const Block& block, Plane plane, int base_x, assert(mode == kProcessingModeParseOnly); residual_buffer_threaded_[sb_row_index][sb_column_index] ->transform_parameters() - ->Push(non_zero_coeff_count, tx_type); + ->Push(TransformParameters(tx_type, non_zero_coeff_count)); } } } @@ -1886,6 +1884,7 @@ bool Tile::AssignInterMv(const Block& block, bool is_compound) { GetClampParameters(block, min, max); BlockParameters& bp = *block.bp; const PredictionParameters& prediction_parameters = *bp.prediction_parameters; + bp.mv.mv64 = 0; if (is_compound) { for (int i = 0; i < 2; ++i) { const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode); @@ -1948,6 +1947,7 @@ bool Tile::AssignIntraMv(const Block& block) { BlockParameters& bp = *block.bp; const PredictionParameters& prediction_parameters = *bp.prediction_parameters; const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0); + bp.mv.mv64 = 0; ReadMotionVector(block, 0); if (ref_mv_0.mv32 == 0) { const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1); @@ -2122,7 +2122,6 @@ void Tile::PopulateDeblockFilterLevel(const Block& block) { } bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, - ParameterTree* const tree, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { // Do not process the block if the starting point is beyond the visible frame. @@ -2133,8 +2132,24 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, column4x4 >= frame_header_.columns4x4) { return true; } - BlockParameters& bp = *tree->parameters(); - block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp); + + if (split_parse_and_decode_) { + // Push block ordering info to the queue. 
DecodeBlock() will use this queue + // to decode the blocks in the correct order. + const int sb_row_index = SuperBlockRowIndex(row4x4); + const int sb_column_index = SuperBlockColumnIndex(column4x4); + residual_buffer_threaded_[sb_row_index][sb_column_index] + ->partition_tree_order() + ->Push(PartitionTreeNode(row4x4, column4x4, block_size)); + } + + BlockParameters* bp_ptr = + block_parameters_holder_.Get(row4x4, column4x4, block_size); + if (bp_ptr == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters."); + return false; + } + BlockParameters& bp = *bp_ptr; Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); bp.size = block_size; bp.prediction_parameters = @@ -2186,16 +2201,13 @@ bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size, return true; } -bool Tile::DecodeBlock(ParameterTree* const tree, +bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { - const int row4x4 = tree->row4x4(); - const int column4x4 = tree->column4x4(); if (row4x4 >= frame_header_.rows4x4 || column4x4 >= frame_header_.columns4x4) { return true; } - const BlockSize block_size = tree->block_size(); Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual); if (!ComputePrediction(block) || !Residual(block, kProcessingModeDecodeOnly)) { @@ -2206,27 +2218,22 @@ bool Tile::DecodeBlock(ParameterTree* const tree, } bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, - ParameterTree* const root, TileScratchBuffer* const scratch_buffer, ResidualPtr* residual) { - Stack<ParameterTree*, kDfsStackSize> stack; + Stack<PartitionTreeNode, kDfsStackSize> stack; // Set up the first iteration. - ParameterTree* node = root; - int row4x4 = row4x4_start; - int column4x4 = column4x4_start; - BlockSize block_size = SuperBlockSize(); + stack.Push( + PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize())); // DFS loop. 
If it sees a terminal node (leaf node), ProcessBlock is invoked. // Otherwise, the children are pushed into the stack for future processing. do { - if (!stack.Empty()) { - // Set up subsequent iterations. - node = stack.Pop(); - row4x4 = node->row4x4(); - column4x4 = node->column4x4(); - block_size = node->block_size(); - } + PartitionTreeNode node = stack.Pop(); + int row4x4 = node.row4x4; + int column4x4 = node.column4x4; + BlockSize block_size = node.block_size; + if (row4x4 >= frame_header_.rows4x4 || column4x4 >= frame_header_.columns4x4) { continue; @@ -2262,13 +2269,13 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, sequence_header_.color_config.subsampling_y); return false; } - if (!node->SetPartitionType(partition)) { - LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed."); - return false; - } + + const int quarter_block4x4 = half_block4x4 >> 1; + const BlockSize split_size = kSubSize[kPartitionSplit][block_size]; + assert(partition == kPartitionNone || sub_size != kBlockInvalid); switch (partition) { case kPartitionNone: - if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer, + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, residual)) { return false; } @@ -2276,28 +2283,82 @@ bool Tile::ProcessPartition(int row4x4_start, int column4x4_start, case kPartitionSplit: // The children must be added in reverse order since a stack is being // used. 
- for (int i = 3; i >= 0; --i) { - ParameterTree* const child = node->children(i); - assert(child != nullptr); - stack.Push(child); - } + stack.Push(PartitionTreeNode(row4x4 + half_block4x4, + column4x4 + half_block4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size)); + stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size)); break; case kPartitionHorizontal: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionVertical: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontalWithTopSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontalWithBottomSplit: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; case kPartitionVerticalWithLeftSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; case kPartitionVerticalWithRightSplit: + if 
(!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; case kPartitionHorizontal4: + for (int i = 0; i < 4; ++i) { + if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + } + break; case kPartitionVertical4: for (int i = 0; i < 4; ++i) { - ParameterTree* const child = node->children(i); - // Once a null child is seen, all the subsequent children will also be - // null. - if (child == nullptr) break; - if (!ProcessBlock(child->row4x4(), child->column4x4(), - child->block_size(), child, scratch_buffer, - residual)) { + if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size, + scratch_buffer, residual)) { return false; } } @@ -2370,7 +2431,7 @@ void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer, } } -bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, +bool Tile::ProcessSuperBlock(int row4x4, int column4x4, TileScratchBuffer* const scratch_buffer, ProcessingMode mode) { const bool parsing = @@ -2388,13 +2449,10 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, if (parsing) { ReadLoopRestorationCoefficients(row4x4, column4x4, block_size); } - const int row = row4x4 / block_width4x4; - const int column = column4x4 / block_width4x4; if (parsing && decoding) { uint8_t* residual_buffer = residual_buffer_.get(); - if (!ProcessPartition(row4x4, column4x4, - block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4, column4x4); return false; @@ -2412,18 +2470,14 @@ bool Tile::ProcessSuperBlock(int 
row4x4, int column4x4, int block_width4x4, } uint8_t* residual_buffer = residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); - if (!ProcessPartition(row4x4, column4x4, - block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4, column4x4); return false; } } else { - uint8_t* residual_buffer = - residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); - if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column), - scratch_buffer, &residual_buffer)) { + if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) { LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d", row4x4, column4x4); return false; @@ -2434,26 +2488,23 @@ bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4, return true; } -bool Tile::DecodeSuperBlock(ParameterTree* const tree, - TileScratchBuffer* const scratch_buffer, - ResidualPtr* residual) { - Stack<ParameterTree*, kDfsStackSize> stack; - stack.Push(tree); - do { - ParameterTree* const node = stack.Pop(); - if (node->partition() != kPartitionNone) { - for (int i = 3; i >= 0; --i) { - if (node->children(i) == nullptr) continue; - stack.Push(node->children(i)); - } - continue; - } - if (!DecodeBlock(node, scratch_buffer, residual)) { +bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index, + TileScratchBuffer* const scratch_buffer) { + uint8_t* residual_buffer = + residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); + Queue<PartitionTreeNode>& partition_tree_order = + *residual_buffer_threaded_[sb_row_index][sb_column_index] + ->partition_tree_order(); + while (!partition_tree_order.Empty()) { + PartitionTreeNode block = partition_tree_order.Front(); + if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size, + scratch_buffer, &residual_buffer)) { 
LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d", - node->row4x4(), node->column4x4()); + block.row4x4, block.column4x4); return false; } - } while (!stack.Empty()); + partition_tree_order.Pop(); + } return true; } diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h index 2df6241..df2da9f 100644 --- a/src/utils/array_2d.h +++ b/src/utils/array_2d.h @@ -120,7 +120,7 @@ class Array2D { const T* operator[](int row) const { return data_view_[row]; } private: - std::unique_ptr<T[]> data_ = nullptr; + std::unique_ptr<T[]> data_; size_t allocated_size_ = 0; size_t size_ = 0; Array2DView<T> data_view_; diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc index 3ccdb9b..3bb9f1e 100644 --- a/src/utils/block_parameters_holder.cc +++ b/src/utils/block_parameters_holder.cc @@ -19,53 +19,29 @@ #include "src/utils/common.h" #include "src/utils/constants.h" #include "src/utils/logging.h" -#include "src/utils/parameter_tree.h" #include "src/utils/types.h" namespace libgav1 { -namespace { - -// Returns the number of super block rows/columns for |value4x4| where value4x4 -// is either rows4x4 or columns4x4. -int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { - return use_128x128_superblock ? 
DivideBy128(MultiplyBy4(value4x4) + 127) - : DivideBy64(MultiplyBy4(value4x4) + 63); -} - -} // namespace - -bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, - bool use_128x128_superblock) { +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) { rows4x4_ = rows4x4; columns4x4_ = columns4x4; - use_128x128_superblock_ = use_128x128_superblock; - if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { - LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); - return false; - } - const int rows = - RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_); - const int columns = - RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_); - const BlockSize sb_size = - use_128x128_superblock_ ? kBlock128x128 : kBlock64x64; - const int multiplier = kNum4x4BlocksWide[sb_size]; - if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) { - LIBGAV1_DLOG(ERROR, "trees_.Reset() failed."); - return false; - } - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < columns; ++j) { - trees_[i][j] = - ParameterTree::Create(i * multiplier, j * multiplier, sb_size); - if (trees_[i][j] == nullptr) { - LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j); - return false; - } - } + index_ = 0; + return block_parameters_cache_.Reset(rows4x4_, columns4x4_) && + block_parameters_.Resize(rows4x4_ * columns4x4_); +} + +BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4, + BlockSize block_size) { + const size_t index = index_.fetch_add(1, std::memory_order_relaxed); + if (index >= block_parameters_.size()) return nullptr; + auto& bp = block_parameters_.get()[index]; + if (bp == nullptr) { + bp.reset(new (std::nothrow) BlockParameters); + if (bp == nullptr) return nullptr; } - return true; + FillCache(row4x4, column4x4, block_size, bp.get()); + return bp.get(); } void BlockParametersHolder::FillCache(int row4x4, int column4x4, diff --git a/src/utils/block_parameters_holder.h 
b/src/utils/block_parameters_holder.h index 35543c3..ca36907 100644 --- a/src/utils/block_parameters_holder.h +++ b/src/utils/block_parameters_holder.h @@ -17,18 +17,18 @@ #ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ #define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ +#include <atomic> #include <memory> #include "src/utils/array_2d.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" -#include "src/utils/parameter_tree.h" +#include "src/utils/dynamic_buffer.h" #include "src/utils/types.h" namespace libgav1 { -// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters -// corresponding to a superblock. +// Holds the BlockParameters pointers to each 4x4 block in the frame. class BlockParametersHolder { public: BlockParametersHolder() = default; @@ -37,10 +37,13 @@ class BlockParametersHolder { BlockParametersHolder(const BlockParametersHolder&) = delete; BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; - // If |use_128x128_superblock| is true, 128x128 superblocks will be used, - // otherwise 64x64 superblocks will be used. - LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, - bool use_128x128_superblock); + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4); + + // Returns a pointer to a BlockParameters object that can be used safely until + // the next call to Reset(). Returns nullptr on memory allocation failure. It + // also fills the cache matrix for the block starting at |row4x4|, |column4x4| + // of size |block_size| with the returned pointer. + BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size); // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This // is done as a simple look up of the |block_parameters_cache_| matrix. @@ -59,20 +62,24 @@ class BlockParametersHolder { int columns4x4() const { return columns4x4_; } - // Returns the ParameterTree corresponding to superblock starting at (|row|, - // |column|). 
- ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); } + private: + // Needs access to FillCache for testing Cdef. + template <int bitdepth, typename Pixel> + friend class PostFilterApplyCdefTest; - // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of - // size |block_size| with the pointer |bp|. void FillCache(int row4x4, int column4x4, BlockSize block_size, BlockParameters* bp); - private: int rows4x4_ = 0; int columns4x4_ = 0; - bool use_128x128_superblock_ = false; - Array2D<std::unique_ptr<ParameterTree>> trees_; + + // Owns the memory of BlockParameters pointers for the entire frame. It can + // hold upto |rows4x4_| * |columns4x4_| objects. Each object will be allocated + // on demand and re-used across frames. + DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_; + + // Points to the next available index of |block_parameters_|. + std::atomic<int> index_; // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by // FillCache() and used by Find() to perform look ups using exactly one look diff --git a/src/utils/common.h b/src/utils/common.h index ae43c2b..2e599f0 100644 --- a/src/utils/common.h +++ b/src/utils/common.h @@ -30,7 +30,6 @@ #include <cassert> #include <cstddef> #include <cstdint> -#include <cstdlib> #include <cstring> #include <type_traits> @@ -131,7 +130,7 @@ inline int CountLeadingZeros(uint64_t n) { #if defined(HAVE_BITSCANREVERSE64) const unsigned char bit_set = _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n)); -#else // !defined(HAVE_BITSCANREVERSE64) +#else // !defined(HAVE_BITSCANREVERSE64) const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int) if (n_hi != 0) { const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); @@ -376,7 +375,7 @@ constexpr bool IsDirectionalMode(PredictionMode mode) { // behavior and result apply to other CPUs' SIMD instructions. 
inline int GetRelativeDistance(const unsigned int a, const unsigned int b, const unsigned int order_hint_shift_bits) { - const int diff = a - b; + const int diff = static_cast<int>(a) - static_cast<int>(b); assert(order_hint_shift_bits <= 31); if (order_hint_shift_bits == 0) { assert(a == 0); diff --git a/src/utils/constants.h b/src/utils/constants.h index 34cf56d..a2076c5 100644 --- a/src/utils/constants.h +++ b/src/utils/constants.h @@ -629,6 +629,52 @@ inline const char* ToString(const LoopRestorationType type) { abort(); } +inline const char* ToString(const TransformSize size) { + switch (size) { + case kTransformSize4x4: + return "kTransformSize4x4"; + case kTransformSize4x8: + return "kTransformSize4x8"; + case kTransformSize4x16: + return "kTransformSize4x16"; + case kTransformSize8x4: + return "kTransformSize8x4"; + case kTransformSize8x8: + return "kTransformSize8x8"; + case kTransformSize8x16: + return "kTransformSize8x16"; + case kTransformSize8x32: + return "kTransformSize8x32"; + case kTransformSize16x4: + return "kTransformSize16x4"; + case kTransformSize16x8: + return "kTransformSize16x8"; + case kTransformSize16x16: + return "kTransformSize16x16"; + case kTransformSize16x32: + return "kTransformSize16x32"; + case kTransformSize16x64: + return "kTransformSize16x64"; + case kTransformSize32x8: + return "kTransformSize32x8"; + case kTransformSize32x16: + return "kTransformSize32x16"; + case kTransformSize32x32: + return "kTransformSize32x32"; + case kTransformSize32x64: + return "kTransformSize32x64"; + case kTransformSize64x16: + return "kTransformSize64x16"; + case kTransformSize64x32: + return "kTransformSize64x32"; + case kTransformSize64x64: + return "kTransformSize64x64"; + case kNumTransformSizes: + return "kNumTransformSizes"; + } + abort(); +} + inline const char* ToString(const TransformType type) { switch (type) { case kTransformTypeDctDct: diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc index a6b7057..b3c51da 100644 --- a/src/utils/cpu.cc 
+++ b/src/utils/cpu.cc @@ -39,7 +39,7 @@ uint64_t Xgetbv() { __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx)); return (static_cast<uint64_t>(edx) << 32) | eax; } -#else // _MSC_VER +#else // _MSC_VER void CpuId(int leaf, uint32_t info[4]) { __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/); } diff --git a/src/utils/cpu.h b/src/utils/cpu.h index 630b251..aefc2df 100644 --- a/src/utils/cpu.h +++ b/src/utils/cpu.h @@ -38,7 +38,7 @@ namespace libgav1 { #if !defined(LIBGAV1_ENABLE_AVX2) #define LIBGAV1_ENABLE_AVX2 1 #endif // !defined(LIBGAV1_ENABLE_AVX2) -#else // !LIBGAV1_ENABLE_SSE4_1 +#else // !LIBGAV1_ENABLE_SSE4_1 // Disable AVX2 when SSE4.1 is disabled as it may rely on shared components. #undef LIBGAV1_ENABLE_AVX2 #define LIBGAV1_ENABLE_AVX2 0 diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h index b51345a..40ece26 100644 --- a/src/utils/dynamic_buffer.h +++ b/src/utils/dynamic_buffer.h @@ -46,6 +46,8 @@ class DynamicBuffer { return true; } + size_t size() const { return size_; } + private: std::unique_ptr<T[]> buffer_; size_t size_ = 0; diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake index 8b6ec4b..587ca5d 100644 --- a/src/utils/libgav1_utils.cmake +++ b/src/utils/libgav1_utils.cmake @@ -39,8 +39,6 @@ list(APPEND libgav1_utils_sources "${libgav1_source}/utils/logging.cc" "${libgav1_source}/utils/logging.h" "${libgav1_source}/utils/memory.h" - "${libgav1_source}/utils/parameter_tree.cc" - "${libgav1_source}/utils/parameter_tree.h" "${libgav1_source}/utils/queue.h" "${libgav1_source}/utils/raw_bit_reader.cc" "${libgav1_source}/utils/raw_bit_reader.h" diff --git a/src/utils/logging.cc b/src/utils/logging.cc index 9a43c22..26e3e15 100644 --- a/src/utils/logging.cc +++ b/src/utils/logging.cc @@ -56,7 +56,7 @@ void Log(LogSeverity severity, const char* file, int line, const char* format, va_end(ap); fprintf(stderr, "\n"); } -#else // !LIBGAV1_ENABLE_LOGGING +#else // !LIBGAV1_ENABLE_LOGGING void 
Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/, const char* /*format*/, ...) {} #endif // LIBGAV1_ENABLE_LOGGING diff --git a/src/utils/logging.h b/src/utils/logging.h index 48928db..473aebd 100644 --- a/src/utils/logging.h +++ b/src/utils/logging.h @@ -35,13 +35,13 @@ // setting LIBGAV1_ENABLE_LOGGING. // Severity is given as an all-caps version of enum LogSeverity with the // leading 'k' removed: LIBGAV1_DLOG(INFO, "..."); -#define LIBGAV1_DLOG(severity, ...) \ - do { \ - constexpr const char* libgav1_logging_internal_basename = \ - ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \ - ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \ - libgav1_logging_internal_basename, __LINE__, \ - __VA_ARGS__); \ +#define LIBGAV1_DLOG(severity, ...) \ + do { \ + constexpr const char* libgav1_logging_internal_basename = \ + libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \ + libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \ + libgav1_logging_internal_basename, __LINE__, \ + __VA_ARGS__); \ } while (0) #else #define LIBGAV1_DLOG(severity, ...) \ @@ -49,10 +49,10 @@ } while (0) #endif // LIBGAV1_ENABLE_LOGGING -#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError +#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError #define LIBGAV1_LOGGING_INTERNAL_WARNING \ - ::libgav1::internal::LogSeverity::kWarning -#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo + libgav1::internal::LogSeverity::kWarning +#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo namespace libgav1 { namespace internal { diff --git a/src/utils/memory.h b/src/utils/memory.h index 219a83f..a8da53b 100644 --- a/src/utils/memory.h +++ b/src/utils/memory.h @@ -71,7 +71,7 @@ inline void* AlignedAlloc(size_t alignment, size_t size) { // more convenient to use memalign(). Unlike glibc, Android does not consider // memalign() an obsolete function. 
return memalign(alignment, size); -#else // !defined(__ANDROID__) +#else // !defined(__ANDROID__) void* ptr = nullptr; // posix_memalign requires that the requested alignment be at least // sizeof(void*). In this case, fall back on malloc which should return diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc deleted file mode 100644 index 9426ce6..0000000 --- a/src/utils/parameter_tree.cc +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2019 The libgav1 Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "src/utils/parameter_tree.h" - -#include <cassert> -#include <memory> -#include <new> - -#include "src/utils/common.h" -#include "src/utils/constants.h" -#include "src/utils/logging.h" -#include "src/utils/types.h" - -namespace libgav1 { - -// static -std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4, - BlockSize block_size, - bool is_leaf) { - std::unique_ptr<ParameterTree> tree( - new (std::nothrow) ParameterTree(row4x4, column4x4, block_size)); - if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) { - tree = nullptr; - } - return tree; -} - -bool ParameterTree::SetPartitionType(Partition partition) { - assert(!partition_type_set_); - partition_ = partition; - partition_type_set_ = true; - const int block_width4x4 = kNum4x4BlocksWide[block_size_]; - const int half_block4x4 = block_width4x4 >> 1; - const int quarter_block4x4 = half_block4x4 >> 1; - const BlockSize sub_size = kSubSize[partition][block_size_]; - const BlockSize split_size = kSubSize[kPartitionSplit][block_size_]; - assert(partition == kPartitionNone || sub_size != kBlockInvalid); - switch (partition) { - case kPartitionNone: - parameters_.reset(new (std::nothrow) BlockParameters()); - return parameters_ != nullptr; - case kPartitionHorizontal: - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr; - case kPartitionVertical: - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr; - case kPartitionSplit: - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, sub_size, false); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, false); - 
children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, false); - children_[3] = ParameterTree::Create( - row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr && children_[3] != nullptr; - case kPartitionHorizontalWithTopSplit: - assert(split_size != kBlockInvalid); - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, split_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - split_size, true); - children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionHorizontalWithBottomSplit: - assert(split_size != kBlockInvalid); - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - split_size, true); - children_[2] = - ParameterTree::Create(row4x4_ + half_block4x4, - column4x4_ + half_block4x4, split_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionVerticalWithLeftSplit: - assert(split_size != kBlockInvalid); - children_[0] = - ParameterTree::Create(row4x4_, column4x4_, split_size, true); - children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, - split_size, true); - children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - sub_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionVerticalWithRightSplit: - assert(split_size != kBlockInvalid); - children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); - children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, - split_size, true); - children_[2] = - ParameterTree::Create(row4x4_ + half_block4x4, - 
column4x4_ + half_block4x4, split_size, true); - return children_[0] != nullptr && children_[1] != nullptr && - children_[2] != nullptr; - case kPartitionHorizontal4: - for (int i = 0; i < 4; ++i) { - children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4, - column4x4_, sub_size, true); - if (children_[i] == nullptr) return false; - } - return true; - default: - assert(partition == kPartitionVertical4); - for (int i = 0; i < 4; ++i) { - children_[i] = ParameterTree::Create( - row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true); - if (children_[i] == nullptr) return false; - } - return true; - } -} - -} // namespace libgav1 diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h deleted file mode 100644 index 935f3eb..0000000 --- a/src/utils/parameter_tree.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2019 The libgav1 Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ -#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ - -#include <cassert> -#include <memory> - -#include "src/utils/common.h" -#include "src/utils/compiler_attributes.h" -#include "src/utils/constants.h" -#include "src/utils/memory.h" -#include "src/utils/types.h" - -namespace libgav1 { - -class ParameterTree : public Allocable { - public: - // Creates a parameter tree to store the parameters of a block of size - // |block_size| starting at coordinates |row4x4| and |column4x4|. 
If |is_leaf| - // is set to true, the memory will be allocated for the BlockParameters for - // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to - // false, |block_size| must be a square block, i.e., - // kBlockWidthPixels[block_size] must be equal to - // kBlockHeightPixels[block_size]. - static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4, - BlockSize block_size, - bool is_leaf = false); - - // Move only (not Copyable). - ParameterTree(ParameterTree&& other) = default; - ParameterTree& operator=(ParameterTree&& other) = default; - ParameterTree(const ParameterTree&) = delete; - ParameterTree& operator=(const ParameterTree&) = delete; - - // Set the partition type of the current node to |partition|. - // if (partition == kPartitionNone) { - // Memory will be allocated for the BlockParameters for this node. - // } else if (partition != kPartitionSplit) { - // The appropriate child nodes will be populated and memory will be - // allocated for the BlockParameters of the children. - // } else { - // The appropriate child nodes will be populated but they are considered to - // be hanging, i.e., future calls to SetPartitionType() on the child nodes - // will have to set them or their descendants to a terminal type. - // } - // This function must be called only once per node. - LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition); - - // Basic getters. - int row4x4() const { return row4x4_; } - int column4x4() const { return column4x4_; } - BlockSize block_size() const { return block_size_; } - Partition partition() const { return partition_; } - ParameterTree* children(int index) const { - assert(index < 4); - return children_[index].get(); - } - // Returns the BlockParameters object of the current node if one exists. - // Otherwise returns nullptr. This function will return a valid - // BlockParameters object only for leaf nodes. 
- BlockParameters* parameters() const { return parameters_.get(); } - - private: - ParameterTree(int row4x4, int column4x4, BlockSize block_size) - : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {} - - Partition partition_ = kPartitionNone; - std::unique_ptr<BlockParameters> parameters_ = nullptr; - int row4x4_ = -1; - int column4x4_ = -1; - BlockSize block_size_ = kBlockInvalid; - bool partition_type_set_ = false; - - // Child values are defined as follows for various partition types: - // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr; - // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr; - // * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left - // partition; 3 bottom-right partition; - // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2 - // bottom partition; 3 nullptr; - // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2 - // bottom-right partition; 3 nullptr; - // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2 - // right partition; 3 nullptr; - // * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2 - // bottom-right partition; 3 nullptr; - // * Horizontal4: 0 top partition; 1 second top partition; 2 third top - // partition; 3 bottom partition; - // * Vertical4: 0 left partition; 1 second left partition; 2 third left - // partition; 3 right partition; - std::unique_ptr<ParameterTree> children_[4] = {}; - - friend class ParameterTreeTest; -}; - -} // namespace libgav1 - -#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h index 76e7bfa..7d8ce8f 100644 --- a/src/utils/raw_bit_reader.h +++ b/src/utils/raw_bit_reader.h @@ -38,7 +38,7 @@ class RawBitReader : public BitReader, public Allocable { size_t* value); // le(n) in the spec. bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec. 
// Reads a variable length unsigned number and stores it in |*value|. On a - // successful return, |*value| is in the range of 0 to UINT32_MAX − 1, + // successful return, |*value| is in the range of 0 to UINT32_MAX - 1, // inclusive. bool ReadUvlc(uint32_t* value); // uvlc() in the spec. bool Finished() const; diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc index 8c8f4fe..a3099e1 100644 --- a/src/utils/threadpool.cc +++ b/src/utils/threadpool.cc @@ -37,17 +37,21 @@ #include <chrono> // NOLINT (unapproved c++11 header) #endif +// Define the GetTid() function, a wrapper for the gettid() system call in +// Linux. +#if defined(__ANDROID__) +static pid_t GetTid() { return gettid(); } +#elif defined(__GLIBC__) // The glibc wrapper for the gettid() system call was added in glibc 2.30. // Emulate it for older versions of glibc. -#if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2, 30) - +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30) +static pid_t GetTid() { return gettid(); } +#else // Older than glibc 2.30 #include <sys/syscall.h> -static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); } - -#endif -#endif // defined(__GLIBC_PREREQ) +static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); } +#endif // glibc 2.30 or later. +#endif // defined(__GLIBC__) namespace libgav1 { @@ -216,7 +220,7 @@ void ThreadPool::WorkerThread::SetupName() { // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails // with error 34 (ERANGE) on Android. 
char name[16]; - pid_t id = gettid(); + pid_t id = GetTid(); int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_, static_cast<int64_t>(id)); assert(rv >= 0); diff --git a/src/utils/types.h b/src/utils/types.h index 374f06b..eba13b7 100644 --- a/src/utils/types.h +++ b/src/utils/types.h @@ -18,6 +18,7 @@ #define LIBGAV1_SRC_UTILS_TYPES_H_ #include <array> +#include <cstddef> #include <cstdint> #include <memory> @@ -512,6 +513,10 @@ struct ObuFrameHeader { Delta delta_lf; // A valid value of reference_frame_index[i] is in the range [0, 7]. -1 // indicates an invalid value. + // + // NOTE: When the frame is an intra frame (frame_type is kFrameKey or + // kFrameIntraOnly), reference_frame_index is not used and may be + // uninitialized. int8_t reference_frame_index[kNumInterReferenceFrameTypes]; // The ref_order_hint[ i ] syntax element in the uncompressed header. // Specifies the expected output order hint for each reference frame. @@ -521,5 +526,24 @@ struct ObuFrameHeader { FilmGrainParams film_grain_params; }; +// Structure used for traversing the partition tree. +struct PartitionTreeNode { + PartitionTreeNode() = default; + PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size) + : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {} + int row4x4 = -1; + int column4x4 = -1; + BlockSize block_size = kBlockInvalid; +}; + +// Structure used for storing the transform parameters in a superblock. 
+struct TransformParameters { + TransformParameters() = default; + TransformParameters(TransformType type, int non_zero_coeff_count) + : type(type), non_zero_coeff_count(non_zero_coeff_count) {} + TransformType type; + int non_zero_coeff_count; +}; + } // namespace libgav1 #endif // LIBGAV1_SRC_UTILS_TYPES_H_ diff --git a/tests/block_utils.cc b/tests/block_utils.cc new file mode 100644 index 0000000..96833a2 --- /dev/null +++ b/tests/block_utils.cc @@ -0,0 +1,130 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/block_utils.h" + +#include <algorithm> +#include <cstdint> +#include <cstdio> +#include <cstring> + +namespace libgav1 { +namespace test_utils { +namespace { + +template <typename Pixel> +void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, + const bool print_padding) { + const int print_width = print_padding ? std::min(stride1, stride2) : width; + const int field_width = (sizeof(Pixel) == 1) ? 
4 : 5; + + for (int y = 0; y < height; ++y) { + printf("[%2d] ", y); + for (int x = 0; x < print_width; ++x) { + if (x >= width) { + if (block1[x] == block2[x]) { + printf("[%*d] ", field_width, block1[x]); + } else { + printf("[*%*d] ", field_width - 1, block1[x]); + } + } else { + if (block1[x] == block2[x]) { + printf("%*d ", field_width, block1[x]); + } else { + printf("*%*d ", field_width - 1, block1[x]); + } + } + } + printf("\n"); + block1 += stride1; + block2 += stride2; + } +} + +} // namespace + +template <typename Pixel> +void PrintBlock(const Pixel* block, int width, int height, int stride, + const bool print_padding /*= false*/) { + const int print_width = print_padding ? stride : width; + const int field_width = (sizeof(Pixel) == 1) ? 4 : 5; + for (int y = 0; y < height; ++y) { + printf("[%2d] ", y); + for (int x = 0; x < print_width; ++x) { + if (x >= width) { + printf("[%*d] ", field_width, block[x]); + } else { + printf("%*d ", field_width, block[x]); + } + } + printf("\n"); + block += stride; + } +} + +template void PrintBlock(const uint8_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +template void PrintBlock(const uint16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +template void PrintBlock(const int8_t* block, int width, int height, int stride, + bool print_padding /*= false*/); +template void PrintBlock(const int16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); + +template <typename Pixel> +bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, + const bool check_padding, const bool print_diff /*= true*/) { + bool ok = true; + const int check_width = check_padding ? 
std::min(stride1, stride2) : width; + for (int y = 0; y < height; ++y) { + const uint64_t row1 = static_cast<uint64_t>(y) * stride1; + const uint64_t row2 = static_cast<uint64_t>(y) * stride2; + ok = memcmp(block1 + row1, block2 + row2, + sizeof(block1[0]) * check_width) == 0; + if (!ok) break; + } + if (!ok && print_diff) { + printf("block1 (width: %d height: %d stride: %d):\n", width, height, + stride1); + PrintBlockDiff(block1, block2, width, height, stride1, stride2, + check_padding); + printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height, + stride2); + PrintBlockDiff(block2, block1, width, height, stride2, stride1, + check_padding); + } + return ok; +} + +template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const int8_t* block1, const int8_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); +template bool CompareBlocks(const int16_t* block1, const int16_t* block2, + int width, int height, int stride1, int stride2, + const bool check_padding, + const bool print_diff /*= true*/); + +} // namespace test_utils +} // namespace libgav1 diff --git a/tests/block_utils.h b/tests/block_utils.h new file mode 100644 index 0000000..4542420 --- /dev/null +++ b/tests/block_utils.h @@ -0,0 +1,62 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_ +#define LIBGAV1_TESTS_BLOCK_UTILS_H_ + +#include <cstdint> + +namespace libgav1 { +namespace test_utils { + +//------------------------------------------------------------------------------ +// Prints |block| pixel by pixel with |width| pixels per row if |print_padding| +// is false, |stride| otherwise. If |print_padding| is true padding pixels are +// surrounded in '[]'. +template <typename Pixel> +void PrintBlock(const Pixel* block, int width, int height, int stride, + bool print_padding = false); + +extern template void PrintBlock(const uint8_t* block, int width, int height, + int stride, bool print_padding /*= false*/); +extern template void PrintBlock(const uint16_t* block, int width, int height, + int stride, bool print_padding /*= false*/); + +//------------------------------------------------------------------------------ +// Compares |block1| and |block2| pixel by pixel checking |width| pixels per row +// if |check_padding| is false, min(|stride1|, |stride2|) pixels otherwise. +// Prints the blocks with differences marked with a '*' if |print_diff| is +// true (the default). 
+ +template <typename Pixel> +bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, bool check_padding, + bool print_diff = true); + +extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2, + int width, int height, int stride1, + int stride2, bool check_padding, + bool print_diff /*= true*/); +extern template bool CompareBlocks(const uint16_t* block1, + const uint16_t* block2, int width, + int height, int stride1, int stride2, + bool check_padding, + bool print_diff /*= true*/); + +} // namespace test_utils +} // namespace libgav1 + +#endif // LIBGAV1_TESTS_BLOCK_UTILS_H_ diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake new file mode 100644 index 0000000..ac2fb2e --- /dev/null +++ b/tests/libgav1_tests.cmake @@ -0,0 +1,626 @@ +# Copyright 2020 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(LIBGAV1_LIBGAV1_TESTS_CMAKE_) + return() +endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_ +set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1) + +set(libgav1_googletest "${libgav1_root}/third_party/googletest") +if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}") + macro(libgav1_add_tests_targets) + + endmacro() + + if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}") + message( + "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n" + "To enable tests download the GoogleTest repository to" + " third_party/googletest:\n\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/google/googletest.git third_party/googletest\n") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE) + endif() + return() +endif() + +# Check GoogleTest compiler requirements. +if((CMAKE_CXX_COMPILER_ID + MATCHES + "Clang|GNU" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5") + OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19")) + macro(libgav1_add_tests_targets) + + endmacro() + + message( + WARNING + "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version" + " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for" + " GoogleTest; disabling unit tests. See" + " https://github.com/google/googletest#compilers for more detail.") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." 
FORCE) + return() +endif() + +list(APPEND libgav1_tests_block_utils_sources + "${libgav1_root}/tests/block_utils.h" + "${libgav1_root}/tests/block_utils.cc") + +list(APPEND libgav1_tests_utils_sources + "${libgav1_root}/tests/third_party/libvpx/acm_random.h" + "${libgav1_root}/tests/third_party/libvpx/md5_helper.h" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.h" + "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc") + +list(APPEND libgav1_tests_utils_test_sources + "${libgav1_root}/tests/utils_test.cc") + +list(APPEND libgav1_average_blend_test_sources + "${libgav1_source}/dsp/average_blend_test.cc") +list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc") +list(APPEND libgav1_convolve_test_sources + "${libgav1_source}/dsp/convolve_test.cc") +list(APPEND libgav1_distance_weighted_blend_test_sources + "${libgav1_source}/dsp/distance_weighted_blend_test.cc") +list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc") +list(APPEND libgav1_intra_edge_test_sources + "${libgav1_source}/dsp/intra_edge_test.cc") +list(APPEND libgav1_intrapred_cfl_test_sources + "${libgav1_source}/dsp/intrapred_cfl_test.cc") +list(APPEND libgav1_intrapred_directional_test_sources + "${libgav1_source}/dsp/intrapred_directional_test.cc") +list(APPEND libgav1_intrapred_filter_test_sources + "${libgav1_source}/dsp/intrapred_filter_test.cc") +list(APPEND libgav1_intrapred_test_sources + "${libgav1_source}/dsp/intrapred_test.cc") +list(APPEND libgav1_inverse_transform_test_sources + "${libgav1_source}/dsp/inverse_transform_test.cc") +list(APPEND libgav1_loop_filter_test_sources + "${libgav1_source}/dsp/loop_filter_test.cc") +list(APPEND libgav1_loop_restoration_test_sources + "${libgav1_source}/dsp/loop_restoration_test.cc") +list(APPEND libgav1_mask_blend_test_sources + "${libgav1_source}/dsp/mask_blend_test.cc") +list(APPEND libgav1_motion_field_projection_test_sources + 
"${libgav1_source}/dsp/motion_field_projection_test.cc") +list(APPEND libgav1_motion_vector_search_test_sources + "${libgav1_source}/dsp/motion_vector_search_test.cc") +list(APPEND libgav1_super_res_test_sources + "${libgav1_source}/dsp/super_res_test.cc") +list(APPEND libgav1_weight_mask_test_sources + "${libgav1_source}/dsp/weight_mask_test.cc") +list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc") +list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc") + +macro(libgav1_add_tests_targets) + if(NOT LIBGAV1_ENABLE_TESTS) + message( + FATAL_ERROR + "This version of libgav1_add_tests_targets() should only be used with" + " LIBGAV1_ENABLE_TESTS set to true.") + endif() + libgav1_add_library(TEST + NAME + libgav1_gtest + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest-all.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_gtest_main + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest_main.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + if(ANDROID OR IOS) + if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) + set(use_absl_threading TRUE) + endif() + elseif(NOT + (DEFINED + LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND LIBGAV1_THREADPOOL_USE_STD_MUTEX)) + set(use_absl_threading TRUE) + endif() + + if(use_absl_threading) + list(APPEND libgav1_common_test_absl_deps absl::synchronization) + endif() + + libgav1_add_executable(TEST + NAME + tests_utils_test + SOURCES + ${libgav1_tests_utils_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_library(TEST + NAME + libgav1_tests_block_utils 
+ TYPE + OBJECT + SOURCES + ${libgav1_tests_block_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_tests_utils + TYPE + OBJECT + SOURCES + ${libgav1_tests_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_executable(TEST + NAME + average_blend_test + SOURCES + ${libgav1_average_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + cdef_test + SOURCES + ${libgav1_cdef_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + convolve_test + SOURCES + ${libgav1_convolve_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + distance_weighted_blend_test + SOURCES + ${libgav1_distance_weighted_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + 
dsp_test + SOURCES + ${libgav1_dsp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_cfl_test + SOURCES + ${libgav1_intrapred_cfl_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_directional_test + SOURCES + ${libgav1_intrapred_directional_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_filter_test + SOURCES + ${libgav1_intrapred_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_test + SOURCES + ${libgav1_intrapred_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intra_edge_test + SOURCES + 
${libgav1_intra_edge_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + inverse_transform_test + SOURCES + ${libgav1_inverse_transform_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_filter_test + SOURCES + ${libgav1_loop_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_restoration_test + SOURCES + ${libgav1_loop_restoration_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + mask_blend_test + SOURCES + ${libgav1_mask_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_field_projection_test + SOURCES + 
${libgav1_motion_field_projection_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_vector_search_test + SOURCES + ${libgav1_motion_vector_search_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + obmc_test + SOURCES + ${libgav1_obmc_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + super_res_test + SOURCES + ${libgav1_super_res_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + warp_test + SOURCES + ${libgav1_warp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + weight_mask_test + SOURCES + 
${libgav1_weight_mask_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) +endmacro() diff --git a/tests/third_party/libvpx/LICENSE b/tests/third_party/libvpx/LICENSE new file mode 100644 index 0000000..83ef339 --- /dev/null +++ b/tests/third_party/libvpx/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/tests/third_party/libvpx/acm_random.h b/tests/third_party/libvpx/acm_random.h new file mode 100644 index 0000000..e8cfc9c --- /dev/null +++ b/tests/third_party/libvpx/acm_random.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ + +#include <cassert> +#include <cstdint> +#include <limits> + +#include "gtest/gtest.h" + +namespace libvpx_test { + +class ACMRandom { + public: + ACMRandom() : random_(DeterministicSeed()) {} + + explicit ACMRandom(int seed) : random_(seed) {} + + void Reset(int seed) { random_.Reseed(seed); } + uint16_t Rand16(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + return (value >> 15) & 0xffff; + } + + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast<int32_t>(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. 
+ return static_cast<int16_t>(random_.Generate(65536)); + } + + int16_t Rand13Signed(void) { + // Use 13 bits: values between 4095 and -4096. + const uint32_t value = random_.Generate(8192); + return static_cast<int16_t>(value) - 4096; + } + + int16_t Rand9Signed(void) { + // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). + const uint32_t value = random_.Generate(512); + return static_cast<int16_t>(value) - 256; + } + + uint8_t Rand8(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 23) & 0xff; + } + + uint8_t Rand8Extremes(void) { + // Returns a random value near 0 or near 255, to better exercise + // saturation behavior. + const uint8_t r = Rand8(); + return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4); + } + + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + + int PseudoUniform(int range) { return random_.Generate(range); } + + int operator()(int n) { return PseudoUniform(n); } + + static constexpr int DeterministicSeed(void) { return 0xbaba; } + + private: + testing::internal::Random random_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ diff --git a/tests/third_party/libvpx/md5_helper.h b/tests/third_party/libvpx/md5_helper.h new file mode 100644 index 0000000..c97b590 --- /dev/null +++ b/tests/third_party/libvpx/md5_helper.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ + +#include <cstddef> +#include <cstdint> + +#include "tests/third_party/libvpx/md5_utils.h" + +namespace libvpx_test { +class MD5 { + public: + MD5() { MD5Init(&md5_); } + + void Add(const uint8_t *data, size_t size) { + MD5Update(&md5_, data, static_cast<uint32_t>(size)); + } + + const char *Get(void) { + static const char hex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + }; + uint8_t tmp[16]; + MD5Context ctx_tmp = md5_; + + MD5Final(tmp, &ctx_tmp); + for (int i = 0; i < 16; i++) { + res_[i * 2 + 0] = hex[tmp[i] >> 4]; + res_[i * 2 + 1] = hex[tmp[i] & 0xf]; + } + res_[32] = 0; + + return res_; + } + + protected: + char res_[33]; + MD5Context md5_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ diff --git a/tests/third_party/libvpx/md5_utils.cc b/tests/third_party/libvpx/md5_utils.cc new file mode 100644 index 0000000..4638e54 --- /dev/null +++ b/tests/third_party/libvpx/md5_utils.cc @@ -0,0 +1,249 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. 
+ * + * Changed so as no longer to depend on Colin Plumb's `usual.h' header + * definitions + * - Ian Jackson <ian@chiark.greenend.org.uk>. + * Still in the public domain. + */ + +#include "tests/third_party/libvpx/md5_utils.h" + +#include <cstring> + +static void byteSwap(UWORD32 *buf, unsigned words) { + md5byte *p; + + /* Only swap bytes for big endian machines */ + int i = 1; + + if (*(char *)&i == 1) return; + + p = (md5byte *)buf; + + do { + *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | + ((unsigned)p[1] << 8 | p[0]); + p += 4; + } while (--words); +} + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(struct MD5Context *ctx) { + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bytes[0] = 0; + ctx->bytes[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { + UWORD32 t; + + /* Update byte count */ + + t = ctx->bytes[0]; + + if ((ctx->bytes[0] = t + len) < t) + ctx->bytes[1]++; /* Carry from low to high */ + + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + + if (t > len) { + memcpy((md5byte *)ctx->in + 64 - t, buf, len); + return; + } + + /* First chunk is an odd size */ + memcpy((md5byte *)ctx->in + 64 - t, buf, t); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += t; + len -= t; + + /* Process data in 64-byte chunks */ + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. 
*/ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK +#endif + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { + UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, 
in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK + +#endif diff --git a/tests/third_party/libvpx/md5_utils.h b/tests/third_party/libvpx/md5_utils.h new file mode 100644 index 0000000..13be035 --- /dev/null +++ b/tests/third_party/libvpx/md5_utils.h @@ -0,0 +1,41 @@ +/* + * This is the header file for the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. 
+ * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' + * header definitions + * - Ian Jackson <ian@chiark.greenend.org.uk>. + * Still in the public domain. + */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ + +#define md5byte unsigned char +#define UWORD32 unsigned int + +typedef struct MD5Context MD5Context; +struct MD5Context { + UWORD32 buf[4]; + UWORD32 bytes[2]; + UWORD32 in[16]; +}; + +void MD5Init(struct MD5Context *context); +void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len); +void MD5Final(unsigned char digest[16], struct MD5Context *context); +void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_ diff --git a/tests/utils.cc b/tests/utils.cc new file mode 100644 index 0000000..b73cf01 --- /dev/null +++ b/tests/utils.cc @@ -0,0 +1,120 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/utils.h" + +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <memory> +#include <string> + +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/gav1/decoder_buffer.h" +#include "src/utils/constants.h" +#include "tests/third_party/libvpx/md5_helper.h" + +namespace libgav1 { +namespace test_utils { + +void ResetDspTable(const int bitdepth) { + dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + memset(dsp, 0, sizeof(dsp::Dsp)); +} + +std::string GetMd5Sum(const void* bytes, size_t size) { + libvpx_test::MD5 md5; + md5.Add(static_cast<const uint8_t*>(bytes), size); + return md5.Get(); +} + +template <typename Pixel> +std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) { + libvpx_test::MD5 md5; + const Pixel* row = block; + for (int i = 0; i < height; ++i) { + md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel)); + row += stride; + } + return md5.Get(); +} + +template std::string GetMd5Sum(const int8_t* block, int width, int height, + int stride); +template std::string GetMd5Sum(const int16_t* block, int width, int height, + int stride); + +std::string GetMd5Sum(const DecoderBuffer& buffer) { + libvpx_test::MD5 md5; + const size_t pixel_size = + (buffer.bitdepth == 8) ? 
sizeof(uint8_t) : sizeof(uint16_t); + for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) { + const int height = buffer.displayed_height[plane]; + const size_t width = buffer.displayed_width[plane] * pixel_size; + const int stride = buffer.stride[plane]; + const uint8_t* plane_buffer = buffer.plane[plane]; + for (int row = 0; row < height; ++row) { + md5.Add(plane_buffer, width); + plane_buffer += stride; + } + } + return md5.Get(); +} + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const void* data, size_t size, + absl::Duration elapsed_time) { + const std::string digest = test_utils::GetMd5Sum(data, size); + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + digest.c_str()); + EXPECT_STREQ(expected_digest, digest.c_str()); +} + +template <typename Pixel> +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const Pixel* block, int width, + int height, int stride, absl::Duration elapsed_time) { + const std::string digest = + test_utils::GetMd5Sum(block, width, height, stride); + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + digest.c_str()); + EXPECT_STREQ(expected_digest, digest.c_str()); +} + +template void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const int8_t* block, + int width, int height, int stride, + absl::Duration elapsed_time); +template void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const int16_t* block, + int width, int height, int stride, + absl::Duration elapsed_time); + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const char actual_digest[], + absl::Duration elapsed_time) { + printf("Mode %s[%31s]: %5d us MD5: %s\n", name, 
function_name, + static_cast<int>(absl::ToInt64Microseconds(elapsed_time)), + actual_digest); + EXPECT_STREQ(expected_digest, actual_digest); +} + +} // namespace test_utils +} // namespace libgav1 diff --git a/tests/utils.h b/tests/utils.h new file mode 100644 index 0000000..b3062da --- /dev/null +++ b/tests/utils.h @@ -0,0 +1,138 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_TESTS_UTILS_H_ +#define LIBGAV1_TESTS_UTILS_H_ + +#include <cstddef> +#include <new> +#include <string> + +#include "absl/base/config.h" +#include "absl/time/time.h" +#include "src/gav1/decoder_buffer.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" + +#ifdef ABSL_HAVE_EXCEPTIONS +#include <exception> +#endif + +namespace libgav1 { +namespace test_utils { + +enum { kAlternateDeterministicSeed = 0x9571 }; +static_assert(kAlternateDeterministicSeed != + libvpx_test::ACMRandom::DeterministicSeed(), + ""); + +// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions +// of new to support googletest allocations. +struct MaxAlignedAllocable { + // Class-specific allocation functions. 
+ static void* operator new(size_t size) { + void* const p = + libgav1::MaxAlignedAllocable::operator new(size, std::nothrow); +#ifdef ABSL_HAVE_EXCEPTIONS + if (p == nullptr) throw std::bad_alloc(); +#endif + return p; + } + static void* operator new[](size_t size) { + void* const p = + libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow); +#ifdef ABSL_HAVE_EXCEPTIONS + if (p == nullptr) throw std::bad_alloc(); +#endif + return p; + } + + // Class-specific non-throwing allocation functions + static void* operator new(size_t size, const std::nothrow_t& tag) noexcept { + return libgav1::MaxAlignedAllocable::operator new(size, tag); + } + static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept { + return libgav1::MaxAlignedAllocable::operator new[](size, tag); + } + + // Class-specific deallocation functions. + static void operator delete(void* ptr) noexcept { + libgav1::MaxAlignedAllocable::operator delete(ptr); + } + static void operator delete[](void* ptr) noexcept { + libgav1::MaxAlignedAllocable::operator delete[](ptr); + } + + // Only called if new (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept { + libgav1::MaxAlignedAllocable::operator delete(ptr, tag); + } + // Only called if new[] (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept { + libgav1::MaxAlignedAllocable::operator delete[](ptr, tag); + } +}; + +// Clears dsp table entries for |bitdepth|. This function is not thread safe. +void ResetDspTable(int bitdepth); + +//------------------------------------------------------------------------------ +// Gets human readable hexadecimal encoded MD5 sum from given data, block, or +// frame buffer. 
+ +std::string GetMd5Sum(const void* bytes, size_t size); +template <typename Pixel> +std::string GetMd5Sum(const Pixel* block, int width, int height, int stride); +std::string GetMd5Sum(const DecoderBuffer& buffer); + +//------------------------------------------------------------------------------ +// Compares the md5 digest of |size| bytes of |data| with |expected_digest|. +// Prints a log message with |name|, |function_name|, md5 digest and +// |elapsed_time|. |name| and |function_name| are merely tags used for logging +// and can be any meaningful string depending on the caller's context. + +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const void* data, size_t size, + absl::Duration elapsed_time); + +//------------------------------------------------------------------------------ +// Compares the md5 digest of |block| with |expected_digest|. The width, height, +// and stride of |block| are |width|, |height|, and |stride|, respectively. +// Prints a log message with |name|, |function_name|, md5 digest and +// |elapsed_time|. |name| and |function_name| are merely tags used for logging +// and can be any meaningful string depending on the caller's context. + +template <typename Pixel> +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const Pixel* block, int width, + int height, int stride, absl::Duration elapsed_time); + +//------------------------------------------------------------------------------ +// Compares |actual_digest| with |expected_digest|. Prints a log message with +// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and +// |function_name| are merely tags used for logging and can be any meaningful +// string depending on the caller's context. 
+ +void CheckMd5Digest(const char name[], const char function_name[], + const char expected_digest[], const char actual_digest[], + absl::Duration elapsed_time); + +} // namespace test_utils +} // namespace libgav1 + +#endif // LIBGAV1_TESTS_UTILS_H_ diff --git a/tests/utils_test.cc b/tests/utils_test.cc new file mode 100644 index 0000000..1d5b598 --- /dev/null +++ b/tests/utils_test.cc @@ -0,0 +1,190 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tests/utils.h" + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <new> + +#include "absl/base/config.h" +#include "gtest/gtest.h" +#include "src/utils/memory.h" + +#ifdef ABSL_HAVE_EXCEPTIONS +#include <exception> +#endif + +namespace libgav1 { +namespace test_utils { +namespace { + +constexpr size_t kMaxAllocableSize = 0x40000000; + +// Has a trivial default constructor that performs no action. +struct SmallMaxAligned : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x; +}; + +// Has a nontrivial default constructor that initializes the data member. +struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x = 0; +}; + +// Has a trivial default constructor that performs no action. +struct HugeMaxAligned : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1]; +}; + +// Has a nontrivial default constructor that initializes the data member. 
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable { + alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {}; +}; + +#ifdef ABSL_HAVE_EXCEPTIONS +struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable { + MaxAlignedThrowingConstructor() { throw std::exception(); } + + uint8_t x; +}; +#endif + +TEST(TestUtilsTest, TestMaxAlignedAllocable) { + { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned); + EXPECT_NE(small, nullptr); + // Note this check doesn't guarantee conformance as a suitably aligned + // address may be returned from any allocator. + EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete is called. + } + + { + // MaxAlignedAllocable::operator new is called. + std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned); + EXPECT_NE(small, nullptr); + // Note this check doesn't guarantee conformance as a suitably aligned + // address may be returned from any allocator. + EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete is called. + } + + { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls( + new (std::nothrow) SmallMaxAligned[10]); + EXPECT_NE(small_array_of_smalls, nullptr); + EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) & + (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete[] is called. + } + + { + // MaxAlignedAllocable::operator new[] is called. + std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls( + new SmallMaxAligned[10]); + EXPECT_NE(small_array_of_smalls, nullptr); + EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) & + (kMaxAlignment - 1), + 0); + // MaxAlignedAllocable::operator delete[] is called. 
+ } + + { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned); + EXPECT_EQ(huge, nullptr); + } + + { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls( + new (std::nothrow) + SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]); + EXPECT_EQ(huge_array_of_smalls, nullptr); + } + +#ifdef ABSL_HAVE_EXCEPTIONS + try { + // MaxAlignedAllocable::operator new (std::nothrow) is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete (std::nothrow) is called. + auto* always = new (std::nothrow) MaxAlignedThrowingConstructor; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete is called. + auto* always = new MaxAlignedThrowingConstructor; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new[] (std::nothrow) is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete[] (std::nothrow) is called. + auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2]; + static_cast<void>(always); + } catch (...) { + } + + try { + // MaxAlignedAllocable::operator new[] is called. + // The constructor throws an exception. + // MaxAlignedAllocable::operator delete[] is called. + auto* always = new MaxAlignedThrowingConstructor[2]; + static_cast<void>(always); + } catch (...) { + } + + // Note these calls are only safe with exceptions enabled as if the throwing + // operator new returns the object is expected to be valid. In this case an + // attempt to invoke the object's constructor on a nullptr may be made which + // is undefined behavior. + try { + // MaxAlignedAllocable::operator new is called. 
+ std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge( + new HugeMaxAlignedNontrivialConstructor); + ADD_FAILURE() << "huge allocation should fail."; + } catch (...) { + SUCCEED(); + } + + try { + // MaxAlignedAllocable::operator new[] is called. + std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]> + huge_array_of_smalls( + new SmallMaxAlignedNontrivialConstructor + [kMaxAllocableSize / + sizeof(SmallMaxAlignedNontrivialConstructor) + + 1]); + ADD_FAILURE() << "huge_array_of_smalls allocation should fail."; + } catch (...) { + SUCCEED(); + } +#endif // ABSL_HAVE_EXCEPTIONS +} + +} // namespace +} // namespace test_utils +} // namespace libgav1 |