author    | Boyuan Yang <byang@debian.org> | 2022-07-14 15:56:59 -0400
committer | Boyuan Yang <byang@debian.org> | 2022-07-14 15:56:59 -0400
commit    | 1a2e17bd28a068714658551c8c355171ce15dfa0 (patch)
tree      | db9e739007016850ee355365874a20b07034ef2c
parent    | a08da9600832caf817125edee2c3206fe24cd5cb (diff)
parent    | d4dbf19f6b0181ee78034bfe4caf189d1c016998 (diff)
download  | libgav1-1a2e17bd28a068714658551c8c355171ce15dfa0.tar.gz, .tar.bz2, .zip
Update upstream source from tag 'upstream/0.18.0'
Update to upstream version '0.18.0'
with Debian dir a69c1f7f3e7109393a3f9f5f1a2e7a5c3d3eda9f
113 files changed, 8878 insertions, 4376 deletions
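Two build-system changes in this import are easy to miss inside the diff that follows. CMakeLists.txt gains a `LIBGAV1_ENABLE_EXAMPLES` option, and Abseil stops being an unconditional hard dependency: it is now required only when examples or tests are enabled, or when the thread pool should use `absl::Mutex`. The new `use_absl_threading` condition is dense CMake, so here is a rough C++ model of the same predicate (the names mirror the CMake cache variables; the function itself is purely illustrative and not part of the build):

```cpp
#include <optional>

// Illustrative model of the new use_absl_threading condition:
//   (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX AND NOT <it>)
//   OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS)
bool UseAbslThreading(std::optional<bool> threadpool_use_std_mutex,
                      bool is_android, bool is_ios) {
  if (threadpool_use_std_mutex.has_value()) {
    // The user decided explicitly; Abseil threading iff std::mutex was refused.
    return !*threadpool_use_std_mutex;
  }
  // Unset: Android and iOS default to std::mutex, everything else to Abseil.
  return !is_android && !is_ios;
}
```

An explicit `LIBGAV1_THREADPOOL_USE_STD_MUTEX` therefore always wins, and only configurations that end up wanting Abseil (for threading, examples, or tests) still hit the fatal "Abseil not found" error.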
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4029de1..52b1b32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,8 @@ libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations." VALUE ON) libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING "Enables sse4.1 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE + ON) libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON) libgav1_option( NAME LIBGAV1_VERBOSE HELPSTRING @@ -101,6 +103,12 @@ libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY) # Controls use of std::mutex and absl::Mutex in ThreadPool. libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX) +if((DEFINED + LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) + OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS)) + set(use_absl_threading TRUE) +endif() if(LIBGAV1_VERBOSE) libgav1_dump_cmake_flag_variables() @@ -124,18 +132,22 @@ endif() libgav1_set_test_flags() set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp") -if(NOT EXISTS "${libgav1_abseil}") - message( - FATAL_ERROR - "Abseil not found. This dependency is required by the" - " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" - " not defined. To continue, download the Abseil repository to" - " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" - " clone \\\n" - " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") +if(EXISTS "${libgav1_abseil}") + set(ABSL_PROPAGATE_CXX_STD ON) + add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" + EXCLUDE_FROM_ALL) +else() + if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS) + message( + FATAL_ERROR + "Abseil not found. This dependency is required by the" + " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is" + " not defined. To continue, download the Abseil repository to" + " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp") + endif() endif() -set(ABSL_PROPAGATE_CXX_STD ON) -add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) libgav1_reset_target_lists() libgav1_add_dsp_targets() @@ -1,7 +1,7 @@ # libgav1 -- an AV1 decoder -libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More -information on the AV1 video format can be found at +libgav1 is a Main profile (0), High profile (1) & Professional profile (2) +compliant AV1 decoder. More information on the AV1 video format can be found at [aomedia.org](https://aomedia.org). [TOC] diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake index 0d00bb6..95c17be 100644 --- a/cmake/libgav1_build_definitions.cmake +++ b/cmake/libgav1_build_definitions.cmake @@ -31,8 +31,8 @@ macro(libgav1_set_build_definitions) # passed to libtool. 
# # We set LIBGAV1_SOVERSION = [c-a].a.r - set(LT_CURRENT 0) - set(LT_REVISION 1) + set(LT_CURRENT 1) + set(LT_REVISION 0) set(LT_AGE 0) math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}") set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}") @@ -142,8 +142,10 @@ macro(libgav1_set_build_definitions) if(NOT LIBGAV1_MAX_BITDEPTH) set(LIBGAV1_MAX_BITDEPTH 10) - elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10) - libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.") + elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 + AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10 + AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12) + libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.") endif() list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}") diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake index b7f6006..e2c79b9 100644 --- a/cmake/libgav1_install.cmake +++ b/cmake/libgav1_install.cmake @@ -48,8 +48,10 @@ macro(libgav1_setup_install_target) FILES ${libgav1_api_includes} DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1") - install(TARGETS gav1_decode DESTINATION - "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + if(LIBGAV1_ENABLE_EXAMPLES) + install(TARGETS gav1_decode DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + endif() install(TARGETS libgav1_static DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") if(BUILD_SHARED_LIBS) diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake index 492957b..b550397 100644 --- a/cmake/toolchains/android.cmake +++ b/cmake/toolchains/android.cmake @@ -30,9 +30,9 @@ if(NOT ANDROID_ABI) set(ANDROID_ABI arm64-v8a) endif() -# Force arm mode for 32-bit targets (instead of the default thumb) to improve -# performance. -if(NOT ANDROID_ARM_MODE) +# Force arm mode for 32-bit arm targets (instead of the default thumb) to +# improve performance. 
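The `LT_*` bump at the start of this hunk is the packaging-relevant change: under the libtool-style rule `LIBGAV1_SOVERSION = [c-a].a.r`, moving from `LT_CURRENT=0, LT_REVISION=1` to `LT_CURRENT=1, LT_REVISION=0` changes the shared-library soname from `libgav1.so.0` to `libgav1.so.1`, an ABI break the Debian package has to track. A worked example of the arithmetic (a sketch, not project code):

```cpp
#include <cstdio>

int main() {
  // Before this release: LT_CURRENT=0, LT_REVISION=1, LT_AGE=0 -> "0.0.1"
  // After (0.18.0):      LT_CURRENT=1, LT_REVISION=0, LT_AGE=0 -> "1.0.0"
  const int lt_current = 1, lt_revision = 0, lt_age = 0;
  const int soversion_major = lt_current - lt_age;  // math(EXPR ...) above
  std::printf("SOVERSION %d.%d.%d\n", soversion_major, lt_age, lt_revision);
  return 0;
}
```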
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE) set(ANDROID_ARM_MODE arm) endif() diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake index 7448f54..7d58ce1 100644 --- a/cmake/toolchains/arm-linux-gnueabihf.cmake +++ b/cmake/toolchains/arm-linux-gnueabihf.cmake @@ -27,10 +27,13 @@ endif() if(NOT CMAKE_C_COMPILER) set(CMAKE_C_COMPILER ${CROSS}gcc) endif() -set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm") +# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of +# gcc: +# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391 +set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3") if(NOT CMAKE_CXX_COMPILER) set(CMAKE_CXX_COMPILER ${CROSS}g++) endif() -set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm") +set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3") set(CMAKE_SYSTEM_PROCESSOR "armv7") set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon") diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake index 1f949f3..a3ec156 100644 --- a/examples/libgav1_examples.cmake +++ b/examples/libgav1_examples.cmake @@ -17,6 +17,13 @@ if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_) endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1) +if(NOT LIBGAV1_ENABLE_EXAMPLES) + macro(libgav1_add_examples_targets) + + endmacro() + return() +endif() + set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc" "${libgav1_examples}/file_reader.h" "${libgav1_examples}/file_reader_constants.cc" diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc index c1a5606..582f13c 100644 --- a/src/buffer_pool.cc +++ b/src/buffer_pool.cc @@ -156,19 +156,15 @@ bool BufferPool::OnFrameBufferSizeChanged(int bitdepth, } RefCountedBufferPtr BufferPool::GetFreeBuffer() { - // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen - // from the same thread serially, but the GetFreeBuffer() call in - // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same - // time. So this function has to be thread safe. - // TODO(b/142583029): Investigate if the GetFreeBuffer() call in - // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function - // need not be thread safe. 
std::unique_lock<std::mutex> lock(mutex_); for (auto buffer : buffers_) { if (!buffer->in_use_) { buffer->in_use_ = true; buffer->progress_row_ = -1; buffer->frame_state_ = kFrameStateUnknown; + buffer->hdr_cll_set_ = false; + buffer->hdr_mdcv_set_ = false; + buffer->itut_t35_set_ = false; lock.unlock(); return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool); } diff --git a/src/buffer_pool.h b/src/buffer_pool.h index d9eba6d..d4e50e0 100644 --- a/src/buffer_pool.h +++ b/src/buffer_pool.h @@ -33,6 +33,7 @@ #include "src/symbol_decoder_context.h" #include "src/utils/compiler_attributes.h" #include "src/utils/constants.h" +#include "src/utils/dynamic_buffer.h" #include "src/utils/reference_info.h" #include "src/utils/segmentation.h" #include "src/utils/segmentation_map.h" @@ -134,6 +135,36 @@ class RefCountedBuffer : public MaxAlignedAllocable { int temporal_id() const { return temporal_id_; } void set_temporal_id(int value) { temporal_id_ = value; } + ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; } + void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) { + hdr_cll_set_ = true; + hdr_cll_ = hdr_cll; + } + bool hdr_cll_set() const { return hdr_cll_set_; } + + ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; } + void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) { + hdr_mdcv_set_ = true; + hdr_mdcv_ = hdr_mdcv; + } + bool hdr_mdcv_set() const { return hdr_mdcv_set_; } + + ObuMetadataItutT35 itut_t35() const { return itut_t35_; } + bool set_itut_t35(const ObuMetadataItutT35& itut_t35, + const uint8_t* const payload) { + itut_t35_ = itut_t35; + if (itut_t35.payload_size > 0) { + if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false; + memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size); + itut_t35_.payload_bytes = itut_t35_payload_.get(); + } else { + itut_t35_.payload_bytes = nullptr; + } + itut_t35_set_ = true; + return true; + } + bool itut_t35_set() const { return itut_t35_set_; } + SegmentationMap* segmentation_map() { return &segmentation_map_; } const SegmentationMap* segmentation_map() const { return &segmentation_map_; } @@ -317,6 +348,14 @@ class RefCountedBuffer : public MaxAlignedAllocable { int spatial_id_ = 0; int temporal_id_ = 0; + ObuMetadataHdrCll hdr_cll_ = {}; + bool hdr_cll_set_ = false; // Set to true when set_hdr_cll() is called. + ObuMetadataHdrMdcv hdr_mdcv_ = {}; + bool hdr_mdcv_set_ = false; // Set to true when set_hdr_mdcv() is called. + ObuMetadataItutT35 itut_t35_ = {}; + DynamicBuffer<uint8_t> itut_t35_payload_; + bool itut_t35_set_ = false; // Set to true when set_itut_t35() is called. + // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array. SegmentationMap segmentation_map_; diff --git a/src/c_decoder_test.c b/src/c_decoder_test.c index 10ef29f..9587262 100644 --- a/src/c_decoder_test.c +++ b/src/c_decoder_test.c @@ -20,6 +20,9 @@ // clang-format off #include "src/gav1/decoder.h" + +// Import the test frame #defines. 
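The metadata plumbing added around this point has two easy-to-miss requirements. First, a recycled frame buffer must drop the previous frame's metadata, which is why `GetFreeBuffer()` now clears `hdr_cll_set_`, `hdr_mdcv_set_`, and `itut_t35_set_` on reuse. Second, the ITU-T T.35 payload is deep-copied into buffer-owned storage, because the parser's input bytes are not guaranteed to outlive the frame. A simplified sketch of that lifecycle (`std::vector` stands in for libgav1's `DynamicBuffer`; only the field names are taken from the diff):

```cpp
#include <cstdint>
#include <vector>

struct ItutT35 {
  const uint8_t* payload_bytes = nullptr;
  int payload_size = 0;
};

class FrameBufferSketch {
 public:
  // Analogue of the new resets in BufferPool::GetFreeBuffer(): without
  // them, a frame could report metadata belonging to whichever frame
  // previously occupied the buffer.
  void ResetForReuse() {
    hdr_cll_set_ = hdr_mdcv_set_ = itut_t35_set_ = false;
  }

  // Analogue of RefCountedBuffer::set_itut_t35(): copy the payload into
  // storage owned by the frame and re-target the pointer at the copy.
  bool SetItutT35(const ItutT35& itut_t35, const uint8_t* payload) {
    itut_t35_ = itut_t35;
    if (itut_t35.payload_size > 0) {
      payload_copy_.assign(payload, payload + itut_t35.payload_size);
      itut_t35_.payload_bytes = payload_copy_.data();
    } else {
      itut_t35_.payload_bytes = nullptr;
    }
    itut_t35_set_ = true;
    return true;
  }

 private:
  ItutT35 itut_t35_;
  std::vector<uint8_t> payload_copy_;
  bool hdr_cll_set_ = false;
  bool hdr_mdcv_set_ = false;
  bool itut_t35_set_ = false;
};
```

(In the real code the buffer resize can fail, which is why `set_itut_t35()` returns `bool`.)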
+#include "src/decoder_test_data.h" // clang-format on #include <stddef.h> @@ -67,40 +70,17 @@ } \ } while (0) -// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf -static const uint8_t kFrame1[] = { - 0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, - 0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, - 0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, - 0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, - 0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, - 0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, - 0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, - 0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, - 0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, - 0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, - 0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, - 0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, - 0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, - 0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, - 0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, - 0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, - 0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, - 0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, - 0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, - 0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, - 0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, - 0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, - 0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, - 0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, - 0xaf, 0xd6, 0x6b, 0x38}; - -static const uint8_t kFrame2[] = { - 0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, - 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, - 0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, - 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76, - 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80}; +static const uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, + OBU_FRAME_1}; + +static const uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2}; + +static const uint8_t kFrame1WithHdrCllAndHdrMdcv[] = { + OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL, + OBU_METADATA_HDR_MDCV, OBU_FRAME_1}; + +static const uint8_t kFrame2WithItutT35[] = { + OBU_TEMPORAL_DELIMITER, OBU_METADATA_ITUT_T35, OBU_FRAME_2}; typedef struct DecoderTest { Libgav1Decoder* decoder; @@ -429,12 +409,68 @@ static void DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(void) { test.decoder = NULL; } +static void DecoderTestMetadataObu(void) { + DecoderTest test; + DecoderTestInit(&test); + DecoderTestSetUp(&test); + + Libgav1StatusCode status; + const Libgav1DecoderBuffer* buffer; + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1WithHdrCllAndHdrMdcv, + sizeof(kFrame1WithHdrCllAndHdrMdcv), 0, + (uint8_t*)&kFrame1WithHdrCllAndHdrMdcv); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. 
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(buffer->has_hdr_cll, 1); + ASSERT_EQ(buffer->has_hdr_mdcv, 1); + ASSERT_EQ(buffer->has_itut_t35, 0); + ASSERT_EQ(test.released_input_buffer, &kFrame1WithHdrCllAndHdrMdcv); + + ASSERT_EQ(test.frames_in_use, 1); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + // Enqueue frame2 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2WithItutT35, + sizeof(kFrame2WithItutT35), 0, + (uint8_t*)&kFrame2WithItutT35); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 1); + + // Dequeue the output of frame2. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(buffer->has_hdr_cll, 0); + ASSERT_EQ(buffer->has_hdr_mdcv, 0); + ASSERT_EQ(buffer->has_itut_t35, 1); + ASSERT_NE(buffer->itut_t35.payload_bytes, NULL); + ASSERT_NE(buffer->itut_t35.payload_size, 0); + ASSERT_EQ(test.released_input_buffer, &kFrame2WithItutT35); + + ASSERT_EQ(test.frames_in_use, 2); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + status = Libgav1DecoderSignalEOS(test.decoder); + ASSERT_EQ(test.frames_in_use, 0); + + Libgav1DecoderDestroy(test.decoder); +} + int main(void) { fprintf(stderr, "C DecoderTest started\n"); DecoderTestAPIFlowForNonFrameParallelMode(); DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing(); DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame(); DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(); + DecoderTestMetadataObu(); fprintf(stderr, "C DecoderTest passed\n"); return 0; } diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc index dbb9e81..e8de64a 100644 --- a/src/decoder_impl.cc +++ b/src/decoder_impl.cc @@ -1171,6 +1171,24 @@ StatusCode DecoderImpl::CopyFrameToOutputBuffer( buffer_.spatial_id = frame->spatial_id(); buffer_.temporal_id = frame->temporal_id(); buffer_.buffer_private_data = frame->buffer_private_data(); + if (frame->hdr_cll_set()) { + buffer_.has_hdr_cll = 1; + buffer_.hdr_cll = frame->hdr_cll(); + } else { + buffer_.has_hdr_cll = 0; + } + if (frame->hdr_mdcv_set()) { + buffer_.has_hdr_mdcv = 1; + buffer_.hdr_mdcv = frame->hdr_mdcv(); + } else { + buffer_.has_hdr_mdcv = 0; + } + if (frame->itut_t35_set()) { + buffer_.has_itut_t35 = 1; + buffer_.itut_t35 = frame->itut_t35(); + } else { + buffer_.has_itut_t35 = 0; + } output_frame_ = frame; return kStatusOk; } @@ -1602,7 +1620,7 @@ StatusCode DecoderImpl::ApplyFilmGrain( (*film_grain_frame)->buffer()->stride(kPlaneV)); const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU); #if LIBGAV1_MAX_BITDEPTH >= 10 - if (displayable_frame->buffer()->bitdepth() > 8) { + if (displayable_frame->buffer()->bitdepth() == 10) { FilmGrain<10> film_grain(displayable_frame->film_grain_params(), displayable_frame->buffer()->is_monochrome(), color_matrix_is_identity, @@ -1625,6 +1643,30 @@ StatusCode DecoderImpl::ApplyFilmGrain( return kStatusOk; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + if (displayable_frame->buffer()->bitdepth() == 12) { + FilmGrain<12> film_grain(displayable_frame->film_grain_params(), + displayable_frame->buffer()->is_monochrome(), + color_matrix_is_identity, + displayable_frame->buffer()->subsampling_x(), + displayable_frame->buffer()->subsampling_y(), + displayable_frame->upscaled_width(), + displayable_frame->frame_height(), thread_pool); + if 
(!film_grain.AddNoise( + displayable_frame->buffer()->data(kPlaneY), + displayable_frame->buffer()->stride(kPlaneY), + displayable_frame->buffer()->data(kPlaneU), + displayable_frame->buffer()->data(kPlaneV), input_stride_uv, + (*film_grain_frame)->buffer()->data(kPlaneY), + (*film_grain_frame)->buffer()->stride(kPlaneY), + (*film_grain_frame)->buffer()->data(kPlaneU), + (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) { + LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 FilmGrain<8> film_grain(displayable_frame->film_grain_params(), displayable_frame->buffer()->is_monochrome(), color_matrix_is_identity, diff --git a/src/decoder_impl.h b/src/decoder_impl.h index b52ecdf..b75417d 100644 --- a/src/decoder_impl.h +++ b/src/decoder_impl.h @@ -141,8 +141,9 @@ class DecoderImpl : public Allocable { int64_t user_private_data, void* buffer_private_data); StatusCode DequeueFrame(const DecoderBuffer** out_ptr); static constexpr int GetMaxBitdepth() { - static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10, - "LIBGAV1_MAX_BITDEPTH must be 8 or 10."); + static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 || + LIBGAV1_MAX_BITDEPTH == 12, + "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12."); return LIBGAV1_MAX_BITDEPTH; } diff --git a/src/decoder_test.cc b/src/decoder_test.cc index de7d490..e274122 100644 --- a/src/decoder_test.cc +++ b/src/decoder_test.cc @@ -20,44 +20,22 @@ #include <new> #include "gtest/gtest.h" +#include "src/decoder_test_data.h" namespace libgav1 { namespace { -// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf -constexpr uint8_t kFrame1[] = { - 0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, - 0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, - 0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, - 0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, - 0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, - 0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, - 0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, - 0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, - 0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, - 0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, - 0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, - 0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, - 0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, - 0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, - 0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, - 0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, - 0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, - 0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, - 0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, - 0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, - 0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, - 0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, - 0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, - 0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, - 0xaf, 0xd6, 0x6b, 0x38}; - -constexpr uint8_t 
kFrame2[] = { - 0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, - 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, - 0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, - 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76, - 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80}; +constexpr uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, + OBU_FRAME_1}; + +constexpr uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2}; + +constexpr uint8_t kFrame1WithHdrCllAndHdrMdcv[] = { + OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL, + OBU_METADATA_HDR_MDCV, OBU_FRAME_1}; + +constexpr uint8_t kFrame2WithItutT35[] = {OBU_TEMPORAL_DELIMITER, + OBU_METADATA_ITUT_T35, OBU_FRAME_2}; class DecoderTest : public testing::Test { public: @@ -348,5 +326,54 @@ TEST_F(DecoderTest, NonFrameParallelModeInvalidFrameAfterEOS) { EXPECT_EQ(frames_in_use_, 0); } +TEST_F(DecoderTest, MetadataObu) { + StatusCode status; + const DecoderBuffer* buffer; + + // Enqueue frame1 for decoding. + status = decoder_->EnqueueFrame( + kFrame1WithHdrCllAndHdrMdcv, sizeof(kFrame1WithHdrCllAndHdrMdcv), 0, + const_cast<uint8_t*>(kFrame1WithHdrCllAndHdrMdcv)); + ASSERT_EQ(status, kStatusOk); + + // Dequeue the output of frame1. + status = decoder_->DequeueFrame(&buffer); + ASSERT_EQ(status, kStatusOk); + ASSERT_NE(buffer, nullptr); + EXPECT_EQ(buffer->has_hdr_cll, 1); + EXPECT_EQ(buffer->has_hdr_mdcv, 1); + EXPECT_EQ(buffer->has_itut_t35, 0); + EXPECT_EQ(released_input_buffer_, &kFrame1WithHdrCllAndHdrMdcv); + + // libgav1 has decoded frame1 and is holding a reference to it. + EXPECT_EQ(frames_in_use_, 1); + EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data); + + // Enqueue frame2 for decoding. + status = + decoder_->EnqueueFrame(kFrame2WithItutT35, sizeof(kFrame2WithItutT35), 0, + const_cast<uint8_t*>(kFrame2WithItutT35)); + ASSERT_EQ(status, kStatusOk); + + EXPECT_EQ(frames_in_use_, 1); + + // Dequeue the output of frame2. + status = decoder_->DequeueFrame(&buffer); + ASSERT_EQ(status, kStatusOk); + ASSERT_NE(buffer, nullptr); + EXPECT_EQ(buffer->has_hdr_cll, 0); + EXPECT_EQ(buffer->has_hdr_mdcv, 0); + EXPECT_EQ(buffer->has_itut_t35, 1); + EXPECT_NE(buffer->itut_t35.payload_bytes, nullptr); + EXPECT_GT(buffer->itut_t35.payload_size, 0); + EXPECT_EQ(released_input_buffer_, &kFrame2WithItutT35); + + EXPECT_EQ(frames_in_use_, 2); + EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data); + + status = decoder_->SignalEOS(); + EXPECT_EQ(frames_in_use_, 0); +} + } // namespace } // namespace libgav1 diff --git a/src/decoder_test_data.h b/src/decoder_test_data.h new file mode 100644 index 0000000..78b6b46 --- /dev/null +++ b/src/decoder_test_data.h @@ -0,0 +1,65 @@ +/* + * Copyright 2022 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
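The new `decoder_test_data.h` (next hunk) splits the old monolithic frame arrays into per-OBU byte fragments, which is what lets the tests splice metadata OBUs between the sequence header and the frame. Each fragment begins with a standard AV1 OBU header byte; decoding it per the spec's `obu_header()` syntax explains the values seen below (this decoder is a standalone illustration, not libgav1 code):

```cpp
#include <cstdint>
#include <cstdio>

// First byte of an AV1 OBU header:
//   bit 7      obu_forbidden_bit (must be 0)
//   bits 6..3  obu_type
//   bit 2      obu_extension_flag
//   bit 1      obu_has_size_field
//   bit 0      obu_reserved_1bit
void PrintObuHeader(uint8_t b) {
  std::printf("type=%d ext=%d has_size=%d\n", (b >> 3) & 0xf, (b >> 2) & 1,
              (b >> 1) & 1);
}

int main() {
  PrintObuHeader(0x12);  // type=2: OBU_TEMPORAL_DELIMITER
  PrintObuHeader(0x0a);  // type=1: OBU_SEQUENCE_HEADER
  PrintObuHeader(0x2a);  // type=5: OBU_METADATA
  PrintObuHeader(0x32);  // type=6: OBU_FRAME
  return 0;
}
```

The byte after the header is the leb128 `obu_size`, and the first payload byte of a metadata OBU is its `metadata_type`, which matches the fragments below: `0x01` (HDR CLL), `0x02` (HDR MDCV), and `0x04` (ITU-T T.35).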
+ */ + +#ifndef LIBGAV1_SRC_DECODER_TEST_DATA_H_ +#define LIBGAV1_SRC_DECODER_TEST_DATA_H_ + +// The bytes for these two frames come from the libaom test vector +// av1-1-b8-01-size-32x32.ivf +#define OBU_TEMPORAL_DELIMITER 0x12, 0x0 +#define OBU_SEQUENCE_HEADER \ + 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, 0xc0, 0x20 +#define OBU_FRAME_1 \ + 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x10, 0x30, \ + 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, 0xf2, 0xa4, 0xa4, \ + 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, 0xa8, 0x6f, 0x8d, \ + 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, 0x8b, 0x0, 0xff, \ + 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, 0xc2, 0xc6, 0x6e, \ + 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, 0xeb, 0xbb, 0x4f, \ + 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, 0x5e, 0x1b, 0x65, \ + 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, 0xd9, 0x8e, 0x9c, \ + 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, 0x62, 0x69, 0xd, \ + 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, 0xa3, 0xe1, 0x42, \ + 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, 0x75, 0xe9, 0xe3, \ + 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, 0x82, 0x87, 0x71, \ + 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, 0xcd, 0xe7, 0x12, \ + 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, 0x45, 0x36, 0x52, \ + 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, 0x26, 0x46, 0x1b, \ + 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, 0xf4, 0x62, 0xf4, \ + 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, 0xe1, 0xd6, 0x88, \ + 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, 0x85, 0x8e, 0xa2, \ + 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, 0xc4, 0x8e, 0x9e, \ + 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, 0x57, 0x86, 0xcb, \ + 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, 0xeb, 0x91, 0xb3, \ + 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, 0x56, 0x75, 0xb3, \ + 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, 0xaf, 0xd6, 0x6b, \ + 0x38 +#define OBU_FRAME_2 \ + 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, 0xa8, 0x80, 0x0, 0x3, \ + 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, 0xb1, 0x51, 0x15, 0x58, 0xc7, \ + 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, \ + 0xa6, 0x11, 0x7, 0x49, 0x76, 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, \ + 0x98, 0x17, 0x3d, 0x80 +#define OBU_METADATA_HDR_CLL 0x2a, 0x06, 0x01, 0x27, 0x10, 0x0d, 0xdf, 0x80 +#define OBU_METADATA_HDR_MDCV \ + 0x2a, 0x1a, 0x02, 0xae, 0x14, 0x51, 0xec, 0x43, 0xd7, 0xb0, 0xa4, 0x26, \ + 0x66, 0x0f, 0x5c, 0x50, 0x0d, 0x54, 0x39, 0x00, 0x0f, 0xa0, 0x00, 0x00, \ + 0x00, 0x00, 0x52, 0x80 +#define OBU_METADATA_ITUT_T35 \ + 0x2a, 0xf, 0x04, 0xa6, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, \ + 0x00, 0x80, 0x00, 0x00 + +#endif // LIBGAV1_SRC_DECODER_TEST_DATA_H_ diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h index 9c46525..c0af2c1 100644 --- a/src/dsp/arm/common_neon.h +++ b/src/dsp/arm/common_neon.h @@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source, return dst; } +inline uint16x8_t MaskOverreadsQ(const uint16x8_t source, + const ptrdiff_t over_read_in_bytes) { + return vreinterpretq_u16_u8( + MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes)); +} + inline uint8x8_t Load1MsanU8(const uint8_t* const source, const ptrdiff_t over_read_in_bytes) { return MaskOverreads(vld1_u8(source), over_read_in_bytes); @@ -325,20 +331,6 @@ inline 
uint16x8_t Load1QMsanU16(const uint16_t* const source, vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes)); } -inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source, - const ptrdiff_t over_read_in_bytes) { - // Relative source index of elements (2 bytes each): - // dst.val[0]: 00 02 04 06 08 10 12 14 - // dst.val[1]: 01 03 05 07 09 11 13 15 - uint16x8x2_t dst = vld2q_u16(source); - dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ( - vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1)); - dst.val[1] = vreinterpretq_u16_u8( - MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]), - (over_read_in_bytes >> 1) + (over_read_in_bytes % 4))); - return dst; -} - inline uint32x4_t Load1QMsanU32(const uint32_t* const source, const ptrdiff_t over_read_in_bytes) { return vreinterpretq_u32_u8(MaskOverreadsQ( @@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) { vst1q_u16(static_cast<uint16_t*>(buf), val); } +inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) { +#if LIBGAV1_MSAN + // The memory shadow is incorrect for vst4q_u16, only marking the first 16 + // bytes of the destination as initialized. To avoid missing truly + // uninitialized memory, check the input vectors first, before marking the + // whole 64 bytes initialized. If any input vector contains unused values, it + // should pass through MaskOverreadsQ first. + __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0])); + __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1])); + __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2])); + __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3])); + vst4q_s16(static_cast<int16_t*>(buf), src); + __msan_unpoison(buf, sizeof(int16x8x4_t)); +#else + vst4q_s16(static_cast<int16_t*>(buf), src); +#endif // LIBGAV1_MSAN +} + //------------------------------------------------------------------------------ // Pointer helpers. @@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) { //------------------------------------------------------------------------------ // Saturation helpers. 
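`Store4QMsanS16()` above exists because MemorySanitizer's shadow for `vst4q_s16` is wrong: only the first 16 of the 64 stored bytes get marked initialized. The helper therefore checks all four input vectors and then unpoisons the destination itself, and as its comment notes, that only stays sound if callers mask overread lanes first. A hypothetical call site showing the intended pairing with the new `uint16x8_t` overload of `MaskOverreadsQ()` (the wrapper function and its name are illustrative):

```cpp
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
// Assumes the helpers from src/dsp/arm/common_neon.h are in scope.

void StoreInterleavedTail(int16_t* dst, const uint16x8_t v[4],
                          ptrdiff_t over_read_in_bytes) {
  int16x8x4_t out;
  for (int i = 0; i < 4; ++i) {
    // Zero the lanes that came from reading past the valid row width, so
    // the blanket __msan_unpoison in Store4QMsanS16() cannot hide a real
    // use of uninitialized data.
    out.val[i] =
        vreinterpretq_s16_u16(MaskOverreadsQ(v[i], over_read_in_bytes));
  }
  Store4QMsanS16(dst, out);
}
```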
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) { +inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low, + const int16x4_t high) { return vmin_s16(vmax_s16(val, low), high); } @@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low, return vminq_s16(vmaxq_s16(val, low), high); } -inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) { +inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) { const int16x8_t low = vdupq_n_s16(0); const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1); @@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); } // Output: // b0.val[0]: 00 01 02 03 16 17 18 19 // b0.val[1]: 04 05 06 07 20 21 22 23 -inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { +inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) { return b0; } -inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { +inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) { uint16x8x2_t b0; b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); @@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) { // 10 11 12 13 // 20 21 22 23 // 30 31 32 33 +// Output: +// 00 10 20 30 +// 01 11 21 31 +// 02 12 22 32 +// 03 13 23 33 inline void Transpose4x4(uint16x4_t a[4]) { // b: // 00 10 02 12 diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc index b7205df..389f029 100644 --- a/src/dsp/arm/convolve_10bit_neon.cc +++ b/src/dsp/arm/convolve_10bit_neon.cc @@ -45,12 +45,12 @@ namespace { // Pixel output range: [ 0, 1023] // Compound output range: [ 3988, 61532] -template <int filter_index> +template <int num_taps> int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x8_t*>(src); int32x4x2_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. 
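The long mechanical rename running through this file swaps the opaque `filter_index` template parameter for the tap count it implies, so the branches read as `num_taps == 6` rather than `filter_index < 2`. The underlying mapping, reconstructed from the `// N tap.` comments in the surrounding hunks:

```cpp
// filter_index -> number of taps, as used by the renamed templates.
constexpr int NumTapsForFilterIndex(int filter_index) {
  return (filter_index == 2)                        ? 8
         : (filter_index == 3)                      ? 2
         : (filter_index == 4 || filter_index == 5) ? 4
                                                    : 6;  // 0 or 1
}
static_assert(NumTapsForFilterIndex(0) == 6, "six-tap filters");
static_assert(NumTapsForFilterIndex(2) == 8, "eight-tap filter");
static_assert(NumTapsForFilterIndex(3) == 2, "two-tap filter");
static_assert(NumTapsForFilterIndex(5) == 4, "four-tap filters");
```

This is also why `DoHorizontalPass` can collapse the old `filter_index == 0` / `== 1` branches into a single six-tap case, and `== 4` / `== 5` into a single four-tap case.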
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); @@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, return sum; } -template <int filter_index> +template <int num_taps> int32x4_t SumOnePassTaps(const uint16x4_t* const src, const int16x4_t* const taps) { const auto* ssrc = reinterpret_cast<const int16x4_t*>(src); int32x4_t sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[3], taps[3]); sum = vmlal_s16(sum, ssrc[4], taps[4]); sum = vmlal_s16(sum, ssrc[5], taps[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, sum = vmlal_s16(sum, ssrc[5], taps[5]); sum = vmlal_s16(sum, ssrc[6], taps[6]); sum = vmlal_s16(sum, ssrc[7], taps[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = vmull_s16(ssrc[0], taps[0]); sum = vmlal_s16(sum, ssrc[1], taps[1]); @@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src, return sum; } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(s + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } const int16x4_t d0 = @@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, const uint16x8_t src_long_hi = vld1q_u16(src + x + 8); uint16x8_t v_src[8]; int32x4x2_t v_sum; - if (filter_index < 2) { + if (num_taps == 6) { 
v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); v_src[4] = vextq_u16(src_long, src_long_hi, 4); v_src[5] = vextq_u16(src_long, src_long_hi, 5); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1); - } else if (filter_index == 2) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1); + } else if (num_taps == 8) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); @@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, v_src[5] = vextq_u16(src_long, src_long_hi, 5); v_src[6] = vextq_u16(src_long, src_long_hi, 6); v_src[7] = vextq_u16(src_long, src_long_hi, 7); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap); - } else if (filter_index == 3) { + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap); + } else if (num_taps == 2) { v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); - } else { // filter_index > 3 + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); + } else { // 4 taps v_src[0] = src_long; v_src[1] = vextq_u16(src_long, src_long_hi, 1); v_src[2] = vextq_u16(src_long, src_long_hi, 2); v_src[3] = vextq_u16(src_long, src_long_hi, 3); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); @@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, int32x4_t v_sum; const uint16x8_t src_long = vld1q_u16(src); v_src[0] = vget_low_u16(src_long); - if (filter_index == 3) { + if (num_taps == 2) { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3); } else { v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1)); v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2)); v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3)); - v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2); + v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2); } if (is_compound || is_2d) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); @@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src, } while (--y != 0); } -template <int filter_index, bool is_2d> +template <int num_taps, bool is_2d> void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride)); const int16x8x2_t input = vzipq_s16(input0, input1); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)), @@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, assert(height % 2 == 
1); const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src)); int32x4_t v_sum; - if (filter_index == 3) { + if (num_taps == 2) { v_sum = vmull_s16(vget_low_s16(input), v_tap[3]); v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]); @@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src, } } -template <int filter_index, bool is_compound, bool is_2d> +template <int num_taps, bool is_compound, bool is_2d> void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const int16x4_t* const v_tap) { - assert(width < 8 || filter_index <= 3); + assert(width < 8 || num_taps != 4); // Don't simplify the redundant if conditions with the template parameters, // which helps the compiler generate compact code. - if (width >= 8 && filter_index <= 3) { - FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>( + if (width >= 8 && num_taps != 4) { + FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, width, height, v_tap); return; } @@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); - assert(filter_index >= 3 && filter_index <= 5); - if (filter_index >= 3 && filter_index <= 5) { + assert(num_taps == 2 || num_taps == 4); + if (num_taps == 2 || num_taps == 4) { if (width == 4) { - FilterHorizontalWidth4<filter_index, is_compound, is_2d>( + FilterHorizontalWidth4<num_taps, is_compound, is_2d>( src, src_stride, dest, pred_stride, height, v_tap); return; } assert(width == 2); if (!is_compound) { - FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, - pred_stride, height, v_tap); + FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest, + pred_stride, height, v_tap); } } } @@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( } if (filter_index == 2) { // 8 tap. - FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride, + FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 1) { // 6 tap. - FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst, + } else if (filter_index < 2) { // 6 tap. + FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 0) { // 6 tap. - FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst, - dst_stride, width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, dst_stride, width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. - FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst, - dst_stride, width, height, v_tap); } else { // 2 tap. 
- FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst, + FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, dst_stride, width, height, v_tap); } } @@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON( filter_index); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* const dst16 = static_cast<uint16_t*>(dst); @@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, srcs[next_row] = vld1q_u16(src_x); src_x += src_stride; - const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); if (is_compound) { const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); const int16x4_t d0 = @@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src, } while (x < width); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, srcs[num_taps] = vld1_u16(src); src += src_stride; - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); - const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); + const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps); if (is_compound) { const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); const int16x4_t d1 = @@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index> +template <int num_taps> void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int height, const int16x4_t* const taps) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); auto* dst16 = static_cast<uint16_t*>(dst); @@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, src += src_stride; srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); - const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps); + const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps); const uint16x4_t d0 = vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); Store2<0>(dst16, d0); @@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON( if (filter_index == 0) { // 6 tap. 
if (width == 2) { - FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 2) { - FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else if (width == 4) { - FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps + 1); } else { - FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 2) { - FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 2) { - FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else if (width == 4) { - FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps + 3); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps + 3); } } else { @@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 2) { - FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else if (width == 4) { - FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps + 2); } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps + 2); } } @@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON( if (filter_index == 0) { // 6 tap. 
if (width == 4) { - FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if ((static_cast<int>(filter_index == 1) & @@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON( static_cast<int>(vertical_filter_id == 9) | static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap. if (width == 4) { - FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 1); } else { - FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 1); } } else if (filter_index == 2) { // 8 tap. if (width == 4) { - FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } } else if (filter_index == 3) { // 2 tap. if (width == 4) { - FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 3); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 3); } } else { @@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON( // treating it as though it has 4. if (filter_index == 1) src += src_stride; if (width == 4) { - FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, height, taps + 2); } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps + 2); } } @@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap( PermuteSrcVals(src_bytes, src_lookup[1])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap( const uint16x4_t src_high[2] = {vget_high_u16(src[0]), vget_high_u16(src[1])}; - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
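For readers skimming the retagged vertical filters: `FilterVertical*` keeps `num_taps` rows of source in registers, emits one output row per loop iteration, and slides the window down by one row. A plain-C++ model of that loop for a single column (illustrative only; the real code works on 2-, 4-, or 8-wide vectors, handles compound offsets, and clamps to the bitdepth range):

```cpp
#include <cstddef>
#include <cstdint>

template <int num_taps>
void VerticalFilterColumn(const int16_t* src, ptrdiff_t stride, int16_t* dst,
                          ptrdiff_t dst_stride, int height,
                          const int16_t* taps, int shift) {
  int16_t window[num_taps];
  // Prime the window with the first num_taps - 1 rows.
  for (int i = 0; i < num_taps - 1; ++i) window[i] = src[i * stride];
  src += (num_taps - 1) * stride;
  for (int y = 0; y < height; ++y) {
    window[num_taps - 1] = *src;  // load the next row
    src += stride;
    int32_t sum = 0;
    for (int k = 0; k < num_taps; ++k) sum += window[k] * taps[k];
    *dst = static_cast<int16_t>((sum + (1 << (shift - 1))) >> shift);
    dst += dst_stride;
    // Slide: row i takes row i+1's place for the next iteration.
    for (int k = 0; k < num_taps - 1; ++k) window[k] = window[k + 1];
  }
}
```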
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap( PermuteSrcVals(src_bytes, src_lookup[3])}; vst1_s16(intermediate, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps), + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps), kInterRoundBitsHorizontal - 1)); src_y = AddByteStride(src_y, src_stride); intermediate += kIntermediateStride; @@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; @@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap( src_high[i] = vget_high_u16(src_i); } - vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>( - src_low, taps_low), - kInterRoundBitsHorizontal - 1)); - vst1_s16( - intermediate_x + 4, - vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high), - kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16(intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); // Avoid right shifting the stride. 
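The horizontal counterpart is the `SumOnePassTaps<num_taps>()` / `vrshrn_n_s32(..., kInterRoundBitsHorizontal - 1)` pair that appears throughout this region: an N-tap dot product widened to 32 bits, then narrowed with a rounding right shift. The scalar equivalent, with the shift kept symbolic since its value comes from libgav1's constants:

```cpp
#include <cstdint>

// One output sample of the horizontal pass, scalar reference.
int16_t OnePassTap(const int16_t* src, const int16_t* taps, int num_taps,
                   int shift /* kInterRoundBitsHorizontal - 1 */) {
  int32_t sum = 0;
  for (int k = 0; k < num_taps; ++k) sum += src[k] * taps[k];
  // vrshrn_n_s32: rounding shift right, then narrow to 16 bits.
  return static_cast<int16_t>((sum + (1 << (shift - 1))) >> shift);
}
```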
src_x = AddByteStride(src_x, src_stride); intermediate_x += kIntermediateStride; diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc index 7d287c8..6087276 100644 --- a/src/dsp/arm/distance_weighted_blend_neon.cc +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4; namespace low_bitdepth { namespace { -inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, +inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0, const int16x8_t pred1, - const int16x4_t weights[2]) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0)); - const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0)); - const int32x4_t blended_lo = - vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1)); - const int32x4_t blended_hi = - vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1)); - - return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4), - vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4)); + const int16x8_t weight) { + // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. + const int16x8_t diff = vsubq_s16(pred0, pred1); + // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4) + const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit); } -template <int width, int height> +template <int width> inline void DistanceWeightedBlendSmall_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2], - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT prediction_1, const int height, + const int16x8_t weight, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); constexpr int step = 16 / width; - for (int y = 0; y < height; y += step) { + int y = height; + do { const int16x8_t src_00 = vld1q_s16(prediction_0); const int16x8_t src_10 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights); + const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight); const int16x8_t src_01 = vld1q_s16(prediction_0); const int16x8_t src_11 = vld1q_s16(prediction_1); prediction_0 += 8; prediction_1 += 8; - const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights); + const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight); - const uint8x8_t result0 = vqmovun_s16(res0); - const uint8x8_t result1 = vqmovun_s16(res1); if (width == 4) { StoreLo4(dst, result0); dst += dest_stride; @@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON( vst1_u8(dst, result1); dst += dest_stride; } - } + y -= step; + } while (y != 0); } inline void DistanceWeightedBlendLarge_NEON( const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t 
weights[2], + const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); @@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON( do { const int16x8_t src0_lo = vld1q_s16(prediction_0 + x); const int16x8_t src1_lo = vld1q_s16(prediction_1 + x); - const int16x8_t res_lo = - ComputeWeightedAverage8(src0_lo, src1_lo, weights); + const uint8x8_t res_lo = + ComputeWeightedAverage8(src0_lo, src1_lo, weight); const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8); const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8); - const int16x8_t res_hi = - ComputeWeightedAverage8(src0_hi, src1_hi, weights); + const uint8x8_t res_hi = + ComputeWeightedAverage8(src0_hi, src1_hi, weight); - const uint8x16_t result = - vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi)); + const uint8x16_t result = vcombine_u8(res_lo, res_hi); vst1q_u8(dst + x, result); x += 16; } while (x < width); @@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON( inline void DistanceWeightedBlend_NEON( const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); - int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)}; - // TODO(johannkoenig): Investigate the branching. May be fine to call with a - // variable height. + // Upscale the weight for vqdmulh. 
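To see why the vqdmulhq_s16 rewrite above matches the reference blend: vqdmulhq_s16 computes (2 * a * b) >> 16, so with the weight preloaded as w0 << 11 it returns ((p0 - p1) * w0 * 2^12) >> 16 = ((p0 - p1) * w0) >> 4, entirely in 16 bits. A scalar spot-check (arbitrary values, saturation aside; function names are illustrative):

// Reference: (p0 * w0 + p1 * w1 + 128) >> 8, with w1 = 16 - w0.
int BlendReference(int p0, int p1, int w0) {
  return (p0 * w0 + p1 * (16 - w0) + 128) >> 8;
}
// Strength-reduced form used above.
int BlendReduced(int p0, int p1, int w0) {
  const int weighted_diff = ((p0 - p1) * w0) >> 4;  // vqdmulhq_s16(diff, w0 << 11)
  return (weighted_diff + p1 + 8) >> 4;             // vaddq_s16, then vqrshrun_n_s16(, 4)
}
// e.g. p0 = 100, p1 = 20, w0 = 9: both yield (900 + 140 + 128) >> 8 = 4, and
// writing (p0 - p1) * w0 = 16t + r (0 <= r < 16) shows the two floors agree
// for all in-range inputs.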
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11); if (width == 4) { - if (height == 4) { - DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, - dest_stride); - } else if (height == 8) { - DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, - dest_stride); - } else { - assert(height == 16); - DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, - dest_stride); - } + DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest, + dest_stride); return; } if (width == 8) { - switch (height) { - case 4: - DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 8: - DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, - dest_stride); - return; - case 16: - DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, - dest_stride); - return; - default: - assert(height == 32); - DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, - dest_stride); - - return; - } + DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest, + dest_stride); + return; } - DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, + DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest, dest_stride); } diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 0b1b481..76e1151 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -18,23 +18,21 @@ #if LIBGAV1_ENABLE_NEON #include <arm_neon.h> -#include <algorithm> #include <cassert> #include <cstddef> #include <cstdint> #include <cstring> -#include <new> #include "src/dsp/arm/common_neon.h" -#include "src/dsp/arm/film_grain_neon.h" -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) { return ZeroExtend(vld1_u8(src)); } -inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return ZeroExtend(Load1MsanU8(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) { + return ZeroExtend(Load1MsanU8(src, 8 - valid_range)); } inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) { @@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) { return vreinterpretq_s16_u16(vld1q_u16(src)); } -inline int16x8_t GetSignedSource8Msan(const uint16_t* src, - int /*valid_range*/) { - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. 
- return vreinterpretq_s16_u16(Load1QMsanU16(src, 0)); +inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) { + return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range)); } inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) { @@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) { } inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint8x16_t src = Load1QMsanU8(luma, 0); - + const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range); + // MemorySanitizer registers vpaddlq_u8 as a use of the memory. return vrshrq_n_u16(vpaddlq_u8(src), 1); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. - return vmovl_u8(Load1MsanU8(luma, 0)); + return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range); } #if LIBGAV1_MAX_BITDEPTH >= 10 @@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma, } inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma, - int subsampling_x, int /*valid_range*/) { + int subsampling_x, int valid_range) { if (subsampling_x != 0) { - // TODO(b/194217060): restore |valid_range| usage after correcting call - // sites causing test vector failures. - const uint16x8x2_t src = Load2QMsanU16(luma, 0); - return vrhaddq_u16(src.val[0], src.val[1]); + const uint16x8x2_t src = vld2q_u16(luma); + const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]); + return MaskOverreadsQ(result, 16 - valid_range); } - // TODO(b/194217060): restore |valid_range| usage after correcting call sites - // causing test vector failures. 
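The restored |valid_range| plumbing follows one convention at every call site above: the second argument to the Msan load helpers is the count of over-read bytes at the tail of the vector (8 - valid_range for a 64-bit load, 16 - valid_range for a 128-bit one). The MaskOverreads helpers appear to clear those trailing lanes so that data read past the end of the row never flows into later arithmetic under MemorySanitizer; a scalar sketch of that behavior (names and zero-fill semantics assumed, not verified against common_neon.h):

// Hedged sketch: zero the trailing |over_read_bytes| lanes of a vector.
void MaskOverreadsScalar(uint8_t* lanes, const int num_lanes,
                         const int over_read_bytes) {
  for (int i = num_lanes - over_read_bytes; i < num_lanes; ++i) {
    lanes[i] = 0;  // these lanes were loaded from past the row's end
  }
}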
- return Load1QMsanU16(luma, 0); + return Load1QMsanU16(luma, 16 - valid_range); } #endif // LIBGAV1_MAX_BITDEPTH >= 10 @@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points, } static_assert(sizeof(scaling_lut[0]) == 2, ""); Memset(scaling_lut, point_scaling[0], - std::max(static_cast<int>(point_value[0]), 1) - << (bitdepth - kBitdepth8)); + (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8)); const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000)); const int32x4_t rounding = vdupq_n_s32(32768); for (int i = 0; i < num_points - 1; ++i) { @@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points, const int16x8x4_t result = { start, vaddq_s16(start, vrshrq_n_s16(delta, 2)), vaddq_s16(start, delta2), vaddq_s16(start, delta3)}; - vst4q_s16(&scaling_lut[x_base], result); + Store4QMsanS16(&scaling_lut[x_base], result); } else { vst1q_s16(&scaling_lut[x_base], full_interp); } @@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low, } template <int bitdepth, typename Pixel> -inline int16x8_t GetScalingFactors( - const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source) { int16_t start_vals[8]; static_assert(bitdepth <= kBitdepth10, "NEON Film Grain is not yet implemented for 12bpp."); +#if LIBGAV1_MSAN + memset(start_vals, 0, sizeof(start_vals)); +#endif for (int i = 0; i < 8; ++i) { - assert(source[i] < kScalingLookupTableSize << (bitdepth - 2)); + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); + start_vals[i] = scaling_lut[source[i]]; + } + return vld1q_s16(start_vals); +} + +template <int bitdepth, typename Pixel> +inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], + const Pixel* source, const int valid_range) { + int16_t start_vals[8]; + static_assert(bitdepth <= kBitdepth10, + "NEON Film Grain is not yet implemented for 12bpp."); + for (int i = 0; i < valid_range; ++i) { + assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); start_vals[i] = scaling_lut[source[i]]; } return vld1q_s16(start_vals); @@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON( const int16x8_t scaling_shift_vect = vdupq_n_s16( (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift); + const int safe_width = width & ~15; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_width; x += 8) { // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]); @@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON( // This operation on the unsigned input is safe in 8bpp because the vector // is widened before it is reinterpreted. const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]); - const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>( - scaling_lut_y, &in_y_row[std::min(x, width)]); + const int16x8_t scaling1 = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect); @@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON( // function for just that case, though the gain would be very small. 
StoreUnsigned8(&out_y_row[x], vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling))); - x += 8; - } while (x < width); + } + + if (x < width) { + assert(width - x < 16); + if (x < width - 8) { + const int16x8_t orig = GetSignedSource8(&in_y_row[x]); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + const int16x8_t combined = vaddq_s16(orig, noise); + // In 8bpp, when params_.clip_to_restricted_range == false, we can + // replace clipping with vqmovun_s16, but it's not likely to be worth + // copying the function for just that case, though the gain would be + // very small. + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + x += 8; + } + const int valid_range_pixels = width - x; + const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]); + const int16x8_t orig = + GetSignedSource8Msan(&in_y_row[x], valid_range_bytes); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut_y, &in_y_row[x], valid_range_pixels); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); + + const int16x8_t combined = vaddq_s16(orig, noise); + StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + } in_y_row += source_stride_y; out_y_row += dest_stride_y; } while (++y < height); @@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON( template <int bitdepth, typename GrainType, typename Pixel> inline int16x8_t BlendChromaValsWithCfl( - const Pixel* LIBGAV1_RESTRICT average_luma_buffer, - const int16_t* LIBGAV1_RESTRICT scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor, const GrainType* LIBGAV1_RESTRICT noise_image_cursor, - const int16x8_t scaling_shift_vect) { - const int16x8_t scaling = - GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const int16x8_t scaling, const int16x8_t scaling_shift_vect) { const int16x8_t orig = GetSignedSource8(chroma_cursor); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect); @@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( const int16x8_t floor = vdupq_n_s16(min_value); const int16x8_t ceiling = vdupq_n_s16(max_chroma); Pixel luma_buffer[16]; - memset(luma_buffer, 0, sizeof(luma_buffer)); // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe // for 16 bit signed integers. In higher bitdepths, however, we have to // expand to 32 to protect the sign bit. 
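The restructured luma loop above trades the old &in_y_row[std::min(x, width)] clamp for an explicit tail: the main loop covers safe_width = width & ~15 pixels, at most one more fully-in-range group of 8 runs unmasked, and the final group uses the masked load plus the range-limited scaling lookup. A sketch of the decomposition (illustrative case analysis, not source code):

// width = 16 * k + rem with 0 <= rem < 16; the main loop covers 16 * k.
// rem > 8:      one unmasked 8-pixel group (the x < width - 8 branch), then
//               a masked group of rem - 8 pixels;
// 0 < rem <= 8: a single masked group of rem pixels;
// rem == 0:     the (x < width) tail block is skipped entirely.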
@@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const uint16x8_t average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; const uint16x8_t average_luma = GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1); StoreUnsigned8(average_luma_buffer, average_luma); + const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>( + scaling_lut, average_luma_buffer, valid_range_chroma_pixels); const int16x8_t blended = BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( - average_luma_buffer, scaling_lut, &in_chroma_row[x], - &(noise_image[y + start_height][x]), scaling_shift_vect); + &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling, + scaling_shift_vect); // In 8bpp, when params_.clip_to_restricted_range == false, we can replace // clipping with vqmovun_s16, but it's not likely to be worth copying the // function for just that case. @@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int8_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) { + const int16x8_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint8_t merged_buffer[8]; const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier); const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier); @@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl( // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required. const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4); vst1_u8(merged_buffer, merged); + const int16x8_t scaling = - GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? 
GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); int16x8_t noise = GetSignedSource8(noise_image_cursor); noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect); return vaddq_s16(orig, noise); @@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint8_t luma_buffer[16]; -#if LIBGAV1_MSAN - // Quiet msan warnings. - memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif const int16x8_t offset = vdupq_n_s16(chroma_offset << 5); start_height >>= subsampling_y; int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; - const int valid_range = width - luma_x; + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]); - const int16x8_t average_luma = vreinterpretq_s16_u16( - GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range)); + const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( + &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); // In 8bpp, when params_.clip_to_restricted_range == false, we can // replace clipping with vqmovun_s16, but the gain would be small. StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. Same as the normal iterations, but the @@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; - const int valid_range_chroma_bytes = - (chroma_width - x) * sizeof(in_chroma_row[0]); + const int valid_range_chroma_pixels = chroma_width - x; const int16x8_t orig_chroma = - GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); + GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. 
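Every right-edge handler above now shares the same shape, with valid_range threaded through instead of the old memset-to-quiet-MSAN approach:

// The shared right-edge pattern (condensed from the hunks above; |Pixel| is
// uint8_t or uint16_t depending on bitdepth).
Pixel luma_buffer[16];
const int valid_range_pixels = width - luma_x;  // asserted to be < 16
memcpy(luma_buffer, &in_y_row[luma_x],
       valid_range_pixels * sizeof(in_y_row[0]));
luma_buffer[valid_range_pixels] = in_y_row[width - 1];  // pad for subsampling
// The vector path then runs on |luma_buffer|, with (chroma_width - x)
// bounding both the masked chroma load and the scaling-LUT gather; the
// valid_range_chroma_pixels << 1 converts that back to a luma entry count.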
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl( const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, const int16_t* LIBGAV1_RESTRICT noise_image_cursor, const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, - const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) { + const int32x4_t& offset, int luma_multiplier, int chroma_multiplier, + bool restrict_scaling_lookup, int valid_range_pixels = 0) { uint16_t merged_buffer[8]; const int32x4_t weighted_luma_low = vmull_n_s16(vget_low_s16(average_luma), luma_multiplier); @@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl( vst1q_u16(merged_buffer, vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel)); const int16x8_t scaling = - GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer); + restrict_scaling_lookup + ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer, + valid_range_pixels) + : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, + merged_buffer); const int16x8_t noise = GetSignedSource8(noise_image_cursor); const int16x8_t scaled_noise = ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect); @@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int chroma_width = (width + subsampling_x) >> subsampling_x; const int safe_chroma_width = chroma_width & ~7; uint16_t luma_buffer[16]; -#if LIBGAV1_MSAN - // TODO(b/194217060): This can be removed if the range calculations below are - // fixed. - memset(luma_buffer, 0, sizeof(luma_buffer)); -#endif // Offset is added before downshifting in order to take advantage of // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp. const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2)); @@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( int y = 0; do { int x = 0; - do { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const int16x8_t average_luma = vreinterpretq_s16_u16( GetAverageLuma(&in_y_row[luma_x], subsampling_x)); @@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/false); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); - - x += 8; - } while (x < safe_chroma_width); + } if (x < chroma_width) { // Begin right edge iteration. 
Same as the normal iterations, but the @@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( const int luma_x = x << subsampling_x; const int valid_range_pixels = width - luma_x; const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + assert(valid_range_pixels < 16); memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const int valid_range_chroma_pixels = chroma_width - x; const int valid_range_chroma_bytes = (chroma_width - x) * sizeof(in_chroma_row[0]); const int16x8_t orig_chroma = GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( - luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + luma_buffer, subsampling_x, valid_range_chroma_pixels << 1)); const int16x8_t blended = BlendChromaValsNoCfl( scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), average_luma, scaling_shift_vect, offset, luma_multiplier, - chroma_multiplier); + chroma_multiplier, /*restrict_scaling_lookup=*/true, + valid_range_chroma_pixels); StoreUnsigned8(&out_chroma_row[x], vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); // End of right edge iteration. @@ -1442,10 +1478,8 @@ void Init10bpp() { dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON<kBitdepth10>; - // TODO(b/194442742): reenable this function after segfault under armv7 ASan - // is fixed. - // dsp->film_grain.blend_noise_luma = - // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>; dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON; dsp->film_grain.blend_noise_chroma[1] = BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>; diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h index 3ba2eef..09596e2 100644 --- a/src/dsp/arm/film_grain_neon.h +++ b/src/dsp/arm/film_grain_neon.h @@ -39,9 +39,7 @@ void FilmGrainInit_NEON(); #define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON -// TODO(b/194442742): reenable this function after segfault under armv7 ASan is -// fixed. -// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON #define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 3cad4a6..e9bdcf0 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH( } while (++y < height); } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. 
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for these functions (4xH and 8+xH) is to know how many blocks -// can be processed with just pixels from |top_ptr|, then handle mixed blocks, -// then handle only blocks that take from |left_ptr|. Additionally, a fast -// index-shuffle approach is used for pred values from |left_column| in sections -// that permit it. +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for these functions (4xH and 8+xH) is to know how many blocks +// can be processed with just pixels from |top_ptr|, then handle mixed blocks, +// then handle only blocks that take from |left_ptr|. Additionally, a fast +// index-shuffle approach is used for pred values from |left_column| in +// sections that permit it. inline void DirectionalZone2_4xH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, @@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH( assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4); - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - // TODO(johannkoenig): Revisit this for |width| == 4. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH( // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. if (min_top_only_x > 0) { - // Round down to the nearest multiple of 8. - // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep, upsampled_top); @@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH( // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. const int min_left_only_y = std::min((4 << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; // +8 increment is OK because if height is 4 this only goes once. - for (; y < left_shuffle_stop_y; + for (; y < min_left_only_y; y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { DirectionalZone2FromLeftCol_WxH<4>( dst, stride, min_height, @@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH( upsample_top_shift); } - // Pick up from the last y-value, using the slower but secure method for - // left prediction. 
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<4>( - dst, stride, min_height, - left_column + ((y - left_base_increment) << upsample_left_shift), - base_left_y, -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row, - xstep_bounds, top_x, xstep, - upsample_top_shift); - } // Loop over y for left_only rows. + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < height; y += 8, dst += stride8) { DirectionalZone3_WxH<4>( dst, stride, min_height, @@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH( } } -// Process a multiple of 8 |width|. -inline void DirectionalZone2_8( +template <bool shuffle_left_column> +inline void DirectionalZone2_8xH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, - const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, - const int height, const int xstep, const int ystep, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - // Helper vector. - const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop incrementers for moving by block (8x8). This function handles blocks // with height 4 as well. They are calculated in one pass so these variables // do not get used. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; - const int ystep8 = ystep << 3; - // Process Wx4 blocks. + // Cover 8x4 case. const int min_height = (height == 4) ? 4 : 8; - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute and can therefore call the Zone1 functions. This assumes |xstep| is - // at least 3. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep, + upsampled_top); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. 
+ const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsample_left_shift); + } else { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } + + DirectionalZone1Blend_WxH<8>( + dst_x, stride, min_height, top_row + (x << upsample_top_shift), + xstep_bounds, top_x, xstep, upsample_top_shift); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } +} + +// Process a multiple of 8 |width|. +inline void DirectionalZone2_WxH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { + const int ystep8 = ystep << 3; // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -677,90 +696,43 @@ inline void DirectionalZone2_8( // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); + // For ystep > 90, at least two sets of 8 columns can be fully computed from + // top_row only. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. int x = 0; - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. |d| represents the number of pixels that can - // fit in one contiguous vector when stepping by |ystep|. For a given x - // position, the left column values can be obtained by VTBL as long as the - // values at row[x + d] and beyond come from the top row. However, this does - // not guarantee that the vector will also contain all of the values needed - // from top row. 
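Both bitdepths replace the old per-column bound d = 16 / ((ystep >> 6) + 1) and the kDirectionalZone2ShuffleInvalidHeight table with this single ystep threshold. The stated analysis is consistent with a quick bound on one 8-lane shuffle: lane i of |left_y| is offset by roughly i * ystep in 1/64 units, so the index spread is about (7 * ystep) >> 6 left-column entries.

// Back-of-envelope check (not from the source): at the ystep < 132
// threshold the spread still fits a 16-entry table lookup.
static_assert(((7 * 131) >> 6) == 14, "8-lane index spread fits 16 entries");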
- const int d = 16 / ((ystep >> 6) + 1); + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height, + xstep, ystep, x, left_offset, xstep_bounds_base, + left_y, upsampled_top, upsampled_left); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - const int max_shuffle_height = - std::min(((x + d) << 6) / xstep, height) & ~7; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep, - upsampled_top); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - } + DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep, + ystep, x, left_offset, xstep_bounds_base, left_y, + upsampled_top, upsampled_left); } - // TODO(johannkoenig): May be able to remove this branch. 
if (x < width) { + const int upsample_top_shift = static_cast<int>(upsampled_top); DirectionalZone1_WxH(dst + x, stride, width - x, height, top_row + (x << upsample_top_shift), -xstep, upsampled_top); @@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON( DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep, upsampled_top, upsampled_left); } else { - DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep, - ystep, upsampled_top, upsampled_left); + DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep, + ystep, upsampled_top, upsampled_left); } } @@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, return vrshrq_n_u16(sum, 5 /*log2(32)*/); } +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16x8_t a_weight, + const uint16x8_t b_weight) { + const uint16x8_t a_product = vmulq_u16(a, a_weight); + const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, @@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, } } +// For Wx4 blocks, load the source for 2 columns. The source for the second +// column is held in the high half of each vector. +inline void LoadEdgeVals2x4(uint16x8x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source_low, + const uint16_t* LIBGAV1_RESTRICT const source_high, + const bool upsampled) { + if (upsampled) { + const uint16x4x2_t low = vld2_u16(source_low); + const uint16x4x2_t high = vld2_u16(source_high); + dest->val[0] = vcombine_u16(low.val[0], high.val[0]); + dest->val[1] = vcombine_u16(low.val[1], high.val[1]); + } else { + dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high)); + dest->val[1] = + vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1)); + } +} + template <bool upsampled> inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, @@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, } template <bool upsampled> +inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const uint16x8_t inverter = vdupq_n_u16(32); + + uint16x8x2_t sampled_left_col; + // Compute two columns at a time, then transpose for storage. + uint16x8_t result[4]; + + // The low half of pre-transpose vectors contains columns 0 through 3. + int left_y_low = base_left_y + ystep; + int left_offset_low = left_y_low >> index_scale_bits; + int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + // The high half of pre-transpose vectors contains columns 4 through 7. 
+ int left_y_high = left_y_low + (ystep << 2); + int left_offset_high = left_y_high >> index_scale_bits; + int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + uint16x8_t weights_0 = + vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + uint16x8_t weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_high += ystep; + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + Transpose4x8(result); + Store8(dst, result[0]); + dst += stride; + Store8(dst, result[1]); + dst += stride; + Store8(dst, result[2]); + dst += stride; + Store8(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. 
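DirectionalZone3_8x4 above packs two columns into each vector, so the interpolation weight differs between the low and high halves; hence the new WeightedBlend overload taking per-lane weights that still sum to 32. Its scalar form, with an arbitrary spot-check (names illustrative):

// Per-lane blend: (a * wa + b * wb + 16) >> 5, with wa + wb == 32.
inline uint16_t WeightedBlendScalar(const uint16_t a, const uint16_t b,
                                    const uint16_t wa, const uint16_t wb) {
  return (a * wa + b * wb + 16) >> 5;
}
// e.g. a = 100, b = 200, wa = 20, wb = 12:
// (2000 + 2400 + 16) >> 5 = 138, i.e. 100 * (20/32) + 200 * (12/32) rounded.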
+ uint16x8_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x8x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x8(result); + Store4(dst, vget_low_u16(result[0])); + dst += stride; + Store4(dst, vget_low_u16(result[1])); + dst += stride; + Store4(dst, vget_low_u16(result[2])); + dst += stride; + Store4(dst, vget_low_u16(result[3])); + dst += stride; + Store4(dst, vget_high_u16(result[0])); + dst += stride; + Store4(dst, vget_high_u16(result[1])); + dst += stride; + Store4(dst, vget_high_u16(result[2])); + dst += stride; + Store4(dst, vget_high_u16(result[3])); +} + +template <bool upsampled> inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { + assert(height == 8 || height == 16); const int upsample_shift = static_cast<int>(upsampled); - int y = 0; - do { - DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep); + if (height == 16) { + dest += stride << 3; + DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift), ystep); - dest += 4 * stride; - y += 4; - } while (y < height); + } } template <bool upsampled> @@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { - int x = 0; - int base_left_y = 0; - do { - // TODO(petersonab): Establish 8x4 transpose to reserve this function for - // 8x4 and 16x4. 
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, - base_left_y); - base_left_y += 4 * ystep; - x += 4; - } while (x < width); + assert(width <= 16); + if (width == 4) { + DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep); + return; + } + DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep); + if (width == 16) { + const int base_left_y = ystep << 3; + DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left, + ystep, base_left_y); + } } template <bool upsampled> @@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON( } while (y != 0); return; } - if (width == 4) { + if (height == 4) { if (upsampled_left) { - DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); } else { - DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); } - } else if (height == 4) { + } else if (width == 4) { if (upsampled_left) { - DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); } else { - DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); } } else { if (upsampled_left) { @@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, return vrshr_n_u16(sum, 5 /*log2(32)*/); } -// Blend two values based on weight pairs that each sum to 32. -inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, - const uint16x8_t a_weight, - const uint16x8_t b_weight) { - const uint16x8_t a_product = vmulq_u16(a, a_weight); - const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); - - return vrshrq_n_u16(sum, 5 /*log2(32)*/); -} - // Because the source values "move backwards" as the row index increases, the // indices derived from ystep are generally negative in localized functions. 
// This is accommodated by making sure the relative indices are within [-15, 0] @@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH( } while (++y < height); } -inline void DirectionalZone2FromLeftCol_8xH( - uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, +inline void DirectionalZone2FromLeftCol_8x8( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, const bool upsampled) { const int upsample_shift = static_cast<int>(upsampled); @@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH( vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1)); const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0); - int y = 0; - do { + for (int y = 0; y < 8; ++y) { uint16x8_t src_left, src_right; LoadStepwise( left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), @@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH( Store8(dst, val); dst += stride; - } while (++y < height); + } } template <bool upsampled> @@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH( } template <bool upsampled> -inline void DirectionalZone1Blend_8xH( - uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, +inline void DirectionalZone1Blend_8x8( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, const int xstep) { const int upsample_shift = static_cast<int>(upsampled); @@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH( const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8x2_t top_vals; - int y = height; - do { + for (int y = 0; y < 8; ++y) { const uint16_t* const src = top_row + (top_x >> scale_bits_x); LoadEdgeVals(&top_vals, src, upsampled); @@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH( dest += stride; zone_bounds += xstep; top_x -= xstep; - } while (--y != 0); + } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices -// that do not correspond to angle derivatives are left at zero. -// Notably, in cases with upsampling, the shuffle-invalid height is always -// greater than the prediction height (which is 8 at maximum). -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - // 7.11.2.4 (8) 90 < angle > 180 // The strategy for these functions (4xH and 8+xH) is to know how many blocks // can be processed with just pixels from |top_ptr|, then handle mixed blocks, @@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH( // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. - // Round down to the nearest multiple of 8. - // TODO(petersonab): Check if rounding to the nearest 4 is okay. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). 
+ const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst), stride >> 1, max_top_only_y, top_row, -xstep); @@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH( xstep_bounds, top_x, xstep); } - // Loop over y for left-only rows. - for (; y < height; y += 8, dst += stride8) { - // Angle expected by Zone3 is flipped about the 180 degree vector, which - // is the x-axis. + // Left-only section. |height| - |y| is assumed equivalent to: + // (y == 0) && (height == 4) + if (height - y == 4) { + DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep); + return; + } + if (y < height) { DirectionalZone3_4xH<upsampled_left>( - dst, stride, min_height, left_column + (y << upsample_left_shift), + dst, stride, height - y, left_column + (y << upsample_left_shift), -ystep); } } @@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4( } } +template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x * sizeof(uint16_t); + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsampled_left); + } else { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } + + DirectionalZone1Blend_8x8<upsampled_top>( + dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x, + xstep); + } + + // Loop over y for left_only rows. 
+ for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } +} + // Process a multiple of 8 |width|. template <bool upsampled_top, bool upsampled_left> -inline void DirectionalZone2_8( +inline void DirectionalZone2_NEON( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, @@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8( dst, stride, top_row, left_column, width, xstep, ystep); return; } - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); // Helper vector. const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop increments for moving by block (8x8). This function handles blocks - // with height 4 as well. They are calculated in one pass so these variables - // do not get used. - const ptrdiff_t stride8 = stride << 3; - const int xstep8 = xstep << 3; const int ystep8 = ystep << 3; // All columns from |min_top_only_x| to the right will only need |top_row| to // compute and can therefore call the Zone1 functions. This assumes |xstep| is // at least 3. assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); + const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8( int16x8_t left_y = vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep); - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. int x = 0; + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x * sizeof(uint16_t); - - // Round down to the nearest multiple of 8. 
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<upsampled_top>( - reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_8xH( - dst_x, stride, 8, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - } + DirectionalZone2_8xH<true, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); } // Reached |min_top_only_x|. 
if (x < width) { @@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON( } if (upsampled_top) { if (upsampled_left) { - DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } else if (upsampled_left) { - DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc index cd47a22..d1adbdf 100644 --- a/src/dsp/arm/intrapred_neon.cc +++ b/src/dsp/arm/intrapred_neon.cc @@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist, const uint16x8_t top_left_dist_low, const uint16x8_t top_left_dist_high) { - // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of - // using movl(x_dist). - const uint8x8_t x_le_top_left_low = - vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low)); - const uint8x8_t x_le_top_left_high = - vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high)); - return vcombine_u8(x_le_top_left_low, x_le_top_left_high); + const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low), + vqmovn_u16(top_left_dist_high)); + return vcleq_u8(x_dist, top_left_dist); } // Select the closest values and collect them. 
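The XLeTopLeft rewrite above is sound because |x_dist| holds 8-bit values: saturating each 16-bit top-left distance to 8 bits with vqmovn_u16 preserves every x <= top_left_dist comparison, since a distance of 256 or more saturates to 255, which an 8-bit x can never exceed. A one-lane scalar sketch of the identity (the helper name is illustrative, not library code):

#include <algorithm>
#include <cstdint>

// Models one lane of the rewrite: comparing the 8-bit distance against the
// saturating-narrowed 16-bit distance matches the original widened compare.
bool XLeTopLeftLane(uint8_t x_dist, uint16_t top_left_dist) {
  const uint8_t narrowed =
      static_cast<uint8_t>(std::min<uint16_t>(top_left_dist, 255));  // vqmovn_u16
  return x_dist <= narrowed;  // vcleq_u8 lane; equals uint16_t{x_dist} <= top_left_dist
}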
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc index bcda131..d6c1450 100644 --- a/src/dsp/arm/intrapred_smooth_neon.cc +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -31,7 +31,6 @@ namespace libgav1 { namespace dsp { - namespace low_bitdepth { namespace { @@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; -inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, - const uint16x4_t weighted_left, - const uint16x4_t weighted_bl, - const uint16x4_t weighted_tr) { - const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); - const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); - const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); - return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); +// 256 - v = vneg_s8(v) +inline uint8x8_t NegateS8(const uint8x8_t v) { + return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); } template <int height> -inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 4; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); - const uint16x4_t weighted_bl = - vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); - - const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); - const uint16x4_t weighted_left = - vget_low_u16(vmull_u8(weights_x_v, left_v)); - const uint16x4_t weighted_tr = - vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); - const uint16x4_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); - - StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_bl, weights_y_v, top_v); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x_v, left_v); + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale); + + StoreLo4(dst, result); dst += stride; } } -inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, - const uint16x8_t weighted_left, - const uint16x8_t weighted_bl, - const uint16x8_t weighted_tr) { - // Maximum value: 0xFF00 - const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); - // Maximum 
value: 0xFF00 - const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); - const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); - return vrshrn_n_u16(pred_2, kSmoothWeightScale); +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl, + const uint16x8_t weighted_left_tr) { + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + return vrshrn_n_u16(avg, kSmoothWeightScale); +} + +inline uint8x8_t CalculateWeightsAndPred( + const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, + const uint8x8_t bottom_left, const uint8x8_t weights_x, + const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { + const uint16x8_t weighted_top = vmull_u8(weights_y, top); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left); + const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); + return CalculatePred(weighted_top_bl, weighted_left_tr); } template <int height> -inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { constexpr int width = 8; const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); @@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - - const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint8x8_t result = - CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v, + weights_x_v, scaled_weights_y, weights_y_v); vst1_u8(dst, result); dst += stride; @@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred( const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, const uint8x8_t weights_y, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_top_bl_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint8x8_t result_low = CalculatePred( - weighted_top_low, weighted_left_low, 
weighted_bl, weighted_tr_low); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = + CalculatePred(weighted_top_bl_low, weighted_left_tr_low); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t weighted_top_bl_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint8x8_t result_high = CalculatePred( - weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t result_high = + CalculatePred(weighted_top_bl_high, weighted_left_tr_high); return vcombine_u8(result_low, result_high); } +// 256 - v = vneg_s8(v) +inline uint8x16_t NegateS8(const uint8x16_t v) { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); +} + template <int width, int height> -inline void Smooth16PlusxN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t top_right = top[width - 1]; @@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); - // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. - // This currently has a performance slope similar to Paeth so it does not - // appear to be register bound for arm64. 
uint8x16_t weights_x_v[4]; weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); if (width > 16) { @@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); + scaled_weights_x[0] = NegateS8(weights_x_v[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); + scaled_weights_x[1] = NegateS8(weights_x_v[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); + scaled_weights_x[2] = NegateS8(weights_x_v[2]); + scaled_weights_x[3] = NegateS8(weights_x_v[3]); } } for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, @@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON( } template <int width, int height> -inline void SmoothVertical4Or8xN_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint8_t*>(top_row); const auto* const left = static_cast<const uint8_t*>(left_column); const uint8_t bottom_left = left[height - 1]; @@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); - const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); - const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON( inline uint8x16_t CalculateVerticalWeightsAndPred( const uint8x16_t top, const uint8x8_t weights_y, const uint16x8_t weighted_bl) { - const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); - const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); - const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); - const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); + const uint16x8_t pred_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t pred_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); const uint8x8_t pred_scaled_high = vrshrn_n_u16(pred_high, 
kSmoothWeightScale); @@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred( } template <int width, int height> -inline void SmoothVertical16PlusxN_NEON( +void SmoothVertical16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON( for (int y = 0; y < height; ++y) { const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); - const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint8x8_t scaled_weights_y = NegateS8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); const uint8x16_t pred_0 = @@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON( } template <int width, int height> -inline void SmoothHorizontal4Or8xN_NEON( +void SmoothHorizontal4Or8xN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON( const uint8x8_t top_right_v = vdup_n_u8(top_right); // Over-reads for 4xN but still within the array. const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4); - // 256 - weights = vneg_s8(weights) - const uint8x8_t scaled_weights_x = - vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x))); + const uint8x8_t scaled_weights_x = NegateS8(weights_x); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); for (int y = 0; y < height; ++y) { const uint8x8_t left_v = vdup_n_u8(left[y]); - - const uint16x8_t weighted_left = vmull_u8(weights_x, left_v); - const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); - const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr); - const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x, left_v); + const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale); if (width == 4) { - StoreLo4(dst, pred_scaled); + StoreLo4(dst, pred); } else { // width == 8 - vst1_u8(dst, pred_scaled); + vst1_u8(dst, pred); } dst += stride; } @@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred( const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) { const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); - const uint16x8_t weighted_tr_low = - vmull_u8(vget_low_u8(scaled_weights_x), top_right); - const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low); - const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); - const uint16x8_t weighted_tr_high = - vmull_u8(vget_high_u8(scaled_weights_x), top_right); - const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_high = - vrshrn_n_u16(pred_high, kSmoothWeightScale); + vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale); return vcombine_u8(pred_scaled_low, pred_scaled_high); } template <int width, int height> 
-inline void SmoothHorizontal16PlusxN_NEON( +void SmoothHorizontal16PlusxN_NEON( void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const void* LIBGAV1_RESTRICT const top_row, const void* LIBGAV1_RESTRICT const left_column) { @@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON( } uint8x16_t scaled_weights_x[4]; - scaled_weights_x[0] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0]))); + scaled_weights_x[0] = NegateS8(weights_x[0]); if (width > 16) { - scaled_weights_x[1] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1]))); + scaled_weights_x[1] = NegateS8(weights_x[1]); if (width == 64) { - scaled_weights_x[2] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2]))); - scaled_weights_x[3] = - vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3]))); + scaled_weights_x[2] = NegateS8(weights_x[2]); + scaled_weights_x[3] = NegateS8(weights_x[3]); } } @@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = { #include "src/dsp/smooth_weights.inc" }; +// 256 - v = vneg_s8(v) +inline uint16x4_t NegateS8(const uint16x4_t v) { + return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); +} + template <int height> -inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t top_v = vld1_u16(top); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v); - - // Weighted top right doesn't change with each row. + const uint16x4_t scaled_weights_x = NegateS8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // Common code between 8xH and [16|32|64]xH. inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, - const uint32x4_t& weighted_corners_low, - const uint32x4_t& weighted_corners_high, - const uint16x4x2_t& top_vals, - const uint16x4x2_t& weights_x, const uint16_t left_y, + const uint32x4_t weighted_corners_low, + const uint32x4_t weighted_corners_high, + const uint16x4x2_t top_vals, + const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. 
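The NegateS8 helpers above (one per vector width and bitdepth) all rely on the same identity: the smooth weights as stored lie in [1, 255], so two's-complement negation of each byte computes exactly 256 - w, replacing a vector subtraction from vdup(256). In the 16-bit variant the high byte of every lane is zero and negates to zero, so each lane still ends up holding 256 - w. A one-lane sketch (the helper name is illustrative):

#include <cstdint>

// vneg_s8 negates each byte modulo 256, i.e. returns (256 - w) & 0xFF, which
// equals 256 - w whenever the weight w is in [1, 255].
uint8_t Negate8(uint8_t w) {
  return static_cast<uint8_t>(0u - w);
}
// Example: Negate8(37) == 219 == 256 - 37.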
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst, } template <int height> -inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4), vld1_u16(kSmoothWeights + 8)}; - // Weighted top right doesn't change with each row. const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = @@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, // For width 16 and above. template <int width, int height> -inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; @@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); // Precompute weighted values that don't vary with |y|. uint32x4_t weighted_tr_low[width >> 3]; uint32x4_t weighted_tr_high[width >> 3]; for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right); const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right); } const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); auto* dst_x = reinterpret_cast<uint16_t*>(dst); @@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, } template <int height> -inline void SmoothVertical4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON( } template <int height> -inline void SmoothVertical8xH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON( for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); - // |weighted_bl| is invariant across the row. const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON( // For width 16 and above. template <int width, int height> -inline void SmoothVerticalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t bottom_left = left[height - 1]; @@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON( const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { - // |weighted_bl| is invariant across the row. 
const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); @@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON( } template <int height> -inline void SmoothHorizontal4xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[3]; @@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON( auto* dst = static_cast<uint8_t*>(dest); const uint16x4_t weights_x = vld1_u16(kSmoothWeights); - const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x); + const uint16x4_t scaled_weights_x = NegateS8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { @@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON( } template <int height> -inline void SmoothHorizontal8xH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[7]; @@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON( vld1_u16(kSmoothWeights + 8)}; const uint32x4_t weighted_tr_low = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right); + vmull_n_u16(NegateS8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = - vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right); + vmull_n_u16(NegateS8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { auto* dst16 = reinterpret_cast<uint16_t*>(dst); @@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON( // For width 16 and above. 
template <int width, int height> -inline void SmoothHorizontalWxH_NEON( - void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, - const void* LIBGAV1_RESTRICT const top_row, - const void* LIBGAV1_RESTRICT const left_column) { +void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { const auto* const top = static_cast<const uint16_t*>(top_row); const auto* const left = static_cast<const uint16_t*>(left_column); const uint16_t top_right = top[width - 1]; auto* dst = static_cast<uint8_t*>(dest); - const uint16x4_t weight_scaling = vdup_n_u16(256); - uint16x4_t weights_x_low[width >> 3]; uint16x4_t weights_x_high[width >> 3]; uint32x4_t weighted_tr_low[width >> 3]; @@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON( for (int i = 0; i < width >> 3; ++i) { const int x = i << 3; weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x); - weighted_tr_low[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right); + weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right); weights_x_high[i] = vld1_u16(kSmoothWeights + width + x); - weighted_tr_high[i] = - vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right); + weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right); } for (int y = 0; y < height; ++y) { @@ -1141,6 +1113,7 @@ void Init10bpp() { dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = SmoothHorizontalWxH_NEON<64, 64>; } + } // namespace } // namespace high_bitdepth #endif // LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc index 617accc..e6f0d9d 100644 --- a/src/dsp/arm/inverse_transform_10bit_neon.cc +++ b/src/dsp/arm/inverse_transform_10bit_neon.cc @@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, const int32x4_t max = vdupq_n_s32((1 << range) - 1); int32x4_t s[4], x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. @@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, for (auto& i : s) { i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); } - Transpose4x4(s, s); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, s); } - StoreDst<4>(dst, step, 0, s); } template <ButterflyRotationFunc butterfly_rotation, @@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, int32x4_t s[8]; int32x4_t x[4]; - LoadSrc<4>(dst, step, 0, x); if (is_row) { - Transpose4x4(x, x); + assert(step == 4); + int32x4x4_t y = vld4q_s32(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; + } else { + LoadSrc<4>(dst, step, 0, x); } // stage 1. 
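The row-pass loads above (in both Dct4_NEON and Adst4_NEON) assert step == 4 because vld4q_s32 only substitutes for LoadSrc + Transpose4x4 when the 4x4 block is contiguous in row-major order: a de-interleaving load by 4 gathers the columns of the block, which is exactly the transposed input the row pass needs. A minimal sketch of the load half (the function name is illustrative):

#include <arm_neon.h>
#include <cstdint>

// De-interleaving load of a contiguous row-major 4x4 block: y.val[j] gathers
// elements j, j+4, j+8, j+12, i.e. column j, so x[] receives the transpose.
void LoadTransposed4x4(const int32_t* src, int32x4_t x[4]) {
  const int32x4x4_t y = vld4q_s32(src);
  for (int i = 0; i < 4; ++i) x[i] = y.val[i];
}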
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); - Transpose4x4(x, x); + int32x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4q_s32(dst, y); + } else { + StoreDst<4>(dst, step, 0, x); } - StoreDst<4>(dst, step, 0, x); } alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 1c2e111..452f14a 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -41,50 +41,6 @@ namespace { //------------------------------------------------------------------------------ -// TODO(slavarnway): Move transpose functions to transpose_neon.h or -// common_neon.h. - -LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4], - int16x8_t out[4]) { - // Swap 16 bit elements. Goes from: - // a0: 00 01 02 03 - // a1: 10 11 12 13 - // a2: 20 21 22 23 - // a3: 30 31 32 33 - // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - const int16x4_t a0 = vget_low_s16(in[0]); - const int16x4_t a1 = vget_low_s16(in[1]); - const int16x4_t a2 = vget_low_s16(in[2]); - const int16x4_t a3 = vget_low_s16(in[3]); - - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - - // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 - // c0.val[1]: 02 12 22 32 06 16 26 36 - // c1.val[0]: 01 11 21 31 05 15 25 35 - // c1.val[1]: 03 13 23 33 07 17 27 37 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - - const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]); - const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]); - const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]); - const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]); - - out[0] = vcombine_s16(d0, d0); - out[1] = vcombine_s16(d1, d1); - out[2] = vcombine_s16(d2, d2); - out[3] = vcombine_s16(d3, d3); -} - // Note this is only used in the final stage of Dct32/64 and Adst16 as the in // place version causes additional stack usage with clang. 
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8], @@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); + assert(step == 4); + int16x8x4_t y = vld4q_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { LoadSrc<16, 4>(dst, step, 0, x); } } else { - LoadSrc<8, 4>(dst, step, 0, x); if (transpose) { - Transpose4x4(x, x); + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]); + } else { + LoadSrc<8, 4>(dst, step, 0, x); } } @@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) { if (stage_is_rectangular) { if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(s, output); - StoreDst<8, 8>(dst, step, 0, output); + int16x8x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = s[i]; + vst4q_s16(dst, y); } else { StoreDst<16, 4>(dst, step, 0, s); } } else { if (transpose) { - Transpose4x4(s, s); + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]); + vst4_s16(dst, y); + } else { + StoreDst<8, 4>(dst, step, 0, s); } - StoreDst<8, 4>(dst, step, 0, s); } } @@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { //------------------------------------------------------------------------------ // Asymmetric Discrete Sine Transforms (ADST). -template <bool stage_is_rectangular> + LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool transpose) { auto* const dst = static_cast<int16_t*>(dest); - int32x4_t s[8]; - int16x8_t x[4]; + int32x4_t s[7]; + int16x4_t x[4]; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t input[8]; - LoadSrc<8, 8>(dst, step, 0, input); - Transpose4x8To8x4(input, x); - } else { - LoadSrc<16, 4>(dst, step, 0, x); - } + if (transpose) { + assert(step == 4); + int16x4x4_t y = vld4_s16(dst); + for (int i = 0; i < 4; ++i) x[i] = y.val[i]; } else { - LoadSrc<8, 4>(dst, step, 0, x); - if (transpose) { - Transpose4x4(x, x); - } + x[0] = vld1_s16(dst); + x[1] = vld1_s16(dst + 1 * step); + x[2] = vld1_s16(dst + 2 * step); + x[3] = vld1_s16(dst + 3 * step); } // stage 1. - s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]); - s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]); + s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]); + s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]); // stage 2. - const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2])); - const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3])); + const int32x4_t a7 = vsubl_s16(x[0], x[2]); + const int32x4_t b7 = vaddw_s16(a7, x[3]); // stage 3. - s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]); - s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]); + s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]); + s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]); // s[0] = s[0] + s[3] - s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]); + s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]); // s[1] = s[1] - s[4] - s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]); + s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]); - s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]); + s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]); s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); // stage 4. 
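The store half is symmetric: vst4 interleaves its four registers, so writing the results through vst4_s16 here (vst4q_s32 in the 10-bit file) transposes the block again on the way back to memory, replacing Transpose4x4 + StoreDst. A 16-bit sketch matching the Dct4/Adst4 paths above (the function name is illustrative):

#include <arm_neon.h>
#include <cstdint>

// Interleaving store: output element (r, c) receives lane r of s[c], so the
// contiguous 4x4 block is written back transposed.
void StoreTransposed4x4(int16_t* dst, const int16x4_t s[4]) {
  int16x4x4_t y;
  for (int i = 0; i < 4; ++i) y.val[i] = s[i];
  vst4_s16(dst, y);
}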
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12); const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12); - x[0] = vcombine_s16(dst_0, dst_0); - x[1] = vcombine_s16(dst_1, dst_1); - x[2] = vcombine_s16(dst_2, dst_2); - x[3] = vcombine_s16(dst_3, dst_3); + x[0] = dst_0; + x[1] = dst_1; + x[2] = dst_2; + x[3] = dst_3; - if (stage_is_rectangular) { - if (transpose) { - int16x8_t output[8]; - Transpose8x4To4x8(x, output); - StoreDst<8, 8>(dst, step, 0, output); - } else { - StoreDst<16, 4>(dst, step, 0, x); - } + if (transpose) { + int16x4x4_t y; + for (int i = 0; i < 4; ++i) y.val[i] = x[i]; + vst4_s16(dst, y); } else { - if (transpose) { - Transpose4x4(x, x); - } - StoreDst<8, 4>(dst, step, 0, x); + vst1_s16(dst, x[0]); + vst1_s16(dst + 1 * step, x[1]); + vst1_s16(dst + 2 * step, x[2]); + vst1_s16(dst + 3 * step, x[3]); } } @@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, int i = adjusted_tx_height; auto* data = src; do { - Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true); + Adst4_NEON(data, /*step=*/4, /*transpose=*/true); data += 16; i -= 4; } while (i != 0); @@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, int i = tx_width; auto* data = src; do { - Adst4_NEON<false>(data, tx_width, /*transpose=*/false); + Adst4_NEON(data, tx_width, /*transpose=*/false); data += 4; i -= 4; } while (i != 0); diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc new file mode 100644 index 0000000..a9dd98f --- /dev/null +++ b/src/dsp/arm/loop_filter_10bit_neon.cc @@ -0,0 +1,1218 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+                                 const uint16x4_t q0, const uint16x4_t q1,
+                                 const uint16_t outer_thresh) {
+  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+  return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+                               const uint16x8_t abd_p1p2_q1q2,
+                               const uint16x8_t abd_p2p3_q2q3,
+                               const uint16_t inner_thresh,
+                               const uint16x4_t outer_mask) {
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+  return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const hev_mask,
+                         uint16x4_t* const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10-bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+                          const uint16x8_t abd_p0p2_q0q2) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                         const uint16x8_t p0q0, const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter6_mask,
+                         uint16x4_t* const is_flat3_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                     inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10-bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+                          const uint16x8_t abd_pn1p0_qn1q0,
+                          const uint16x8_t abd_pn2p0_qn2q0) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                         const uint16x8_t p1q1, const uint16x8_t p0q0,
+                         const uint16_t hev_thresh, const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter8_mask,
+                         uint16x4_t* const is_flat4_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t is_flat4 =
+      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                   inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of Filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use Filter8. Because Filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                    uint16x8_t* const p1q1_result,
+                    uint16x8_t* const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Some of the tricks below were inherited from the 8bpp version, where 8x8
+  // is the smallest vector size; they may be unnecessary here.
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // Need to shift the second term or we end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Offset by 2 uint16_t values to load from first p1 position. + auto* dst = static_cast<uint8_t*>(dest) - 4; + auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); + auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); + auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); + auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); + + uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1)}; + Transpose4x4(src); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + vst1_u16(dst_p1, output[0]); + vst1_u16(dst_p0, output[1]); + vst1_u16(dst_q0, output[2]); + vst1_u16(dst_q1, output[3]); +} + +inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p1 and q1 output from opposite directions. 
+ // The formula is regrouped to allow 3 doubling operations to be combined. + // + // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 + // ^^^^^^^^ + // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) + // ^^^^^^^^ + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ + uint16x8_t sum = vaddq_u16(p2q2, p1q1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p0q0); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^ + sum = vshlq_n_u16(sum, 1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ ^^^^^^ + // Should dual issue with the left shift. + const uint16x8_t q0p0 = Transpose64(p0q0); + const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); + sum = vaddq_u16(sum, outer_sum); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - (2 * p2) + q0 + q1 + // q0 = q1 - (2 * q2) + p0 + p1 + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + sum = vsubq_u16(sum, p2q2_double); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + + const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), + vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. 
Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + // Left side of the filter window. + auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Overread by 2 values. These overreads become the high halves of src_raw[2] + // and src_raw[3] after transpose. + uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + Transpose4x8(src_raw); + // p2, p1, p0, q0, q1, q2 + const uint16x4_t src[6] = { + vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), + vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), + vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), + }; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. 
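Read per lane, the vbslq_u16 selection cascade below reduces to a three-way choice. A scalar sketch of the same logic (illustrative only; names mirror the vector variables, with f6 and f4 the Filter6() and Filter4() candidates):

inline uint16_t SelectTap(const bool needs_filter, const bool is_flat3,
                          const uint16_t f6, const uint16_t f4,
                          const uint16_t src) {
  if (!needs_filter) return src;  // outer vbslq_u16 with needs_filter_mask_8
  return is_flat3 ? f6 : f4;      // inner vbslq_u16 with is_flat3_mask_8
}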
+ uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + // dst_n starts at p2, so adjust to p1. + vst1_u16(dst_0 + 1, output[0]); + vst1_u16(dst_1 + 1, output[1]); + vst1_u16(dst_2 + 1, output[2]); + vst1_u16(dst_3 + 1, output[3]); +} + +inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p2 and q2 output from opposite directions. + // The formula is regrouped to allow 2 doubling operations to be combined. + // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 + // ^^^^^^^^ + // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) + // ^^^^^^^^ + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^ + uint16x8_t sum = vshlq_n_u16(p23q23, 1); + + // Add two other terms to make dual issue with shift more likely. 
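Aside on the Transpose64() helper used throughout these sums (q0p0, q1p1 and so on, just below): it swaps the 64-bit halves of a vector so the packed {p, q} register becomes {q, p}, letting one vector add combine taps from both sides of the edge. A minimal sketch, assuming an implementation on top of VEXT; the real helper lives in the shared NEON utility header:

inline uint16x8_t Transpose64Sketch(const uint16x8_t pq) {
  // {p0 p1 p2 p3 | q0 q1 q2 q3} -> {q0 q1 q2 q3 | p0 p1 p2 p3}
  return vextq_u16(pq, pq, 4);
}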
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^^^ + sum = vaddq_u16(sum, p01q01); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p3q3); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p2q2_output = vrshrq_n_u16(sum, 3); + + // Convert to p1 and q1 output: + // p1 = p2 - p3 - p2 + p1 + q1 + // q1 = q2 - q3 - q2 + q0 + p1 + sum = vsubq_u16(sum, p23q23); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - p3 - p1 + p0 + q2 + // q0 = q1 - q3 - q1 + q0 + p2 + sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + const uint16x4_t src[8] = { + vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); + const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); + const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); + const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. 
Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); +} + +inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { + return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); +} + +void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. + // To get desired pairs after transpose, one half should be reversed. + uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + + // src[0] = p0q0 + // src[1] = p1q1 + // src[2] = p2q2 + // src[3] = p3q3 + LoopFilterTranspose4x8(src); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), + vget_high_u16(src[1]), outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = src[0]; + const uint16x8_t p1q1 = src[1]; + const uint16x8_t p2q2 = src[2]; + const uint16x8_t p3q3 = src[3]; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. 
|is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; + // After transpose, |output| will contain rows of the form: + // p0 p1 p2 p3 q0 q1 q2 q3 + Transpose4x8(output); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, ReverseLowHalf(output[0])); + vst1q_u16(dst_1, ReverseLowHalf(output[1])); + vst1q_u16(dst_2, ReverseLowHalf(output[2])); + vst1q_u16(dst_3, ReverseLowHalf(output[3])); +} + +inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, + const uint16x8_t p4q4, const uint16x8_t p3q3, + const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p5q5_output, + uint16x8_t* const p4q4_output, + uint16x8_t* const p3q3_output, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions. 
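Filter14() keeps a single running 16-tap sum and slides it from one output tap to the next, subtracting the taps that leave the window and adding the ones that enter. The first two steps of the derivation below, written out in scalar form (plain ints standing in for the packed lanes; illustrative only):

int sum = 7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0;  // weights sum to 16
const int out_p5 = (sum + 8) >> 4;  // vrshrq_n_u16(sum, 4)
sum += p3 + q1 - 2 * p6;            // p4 = p5 - (2 * p6) + p3 + q1
const int out_p4 = (sum + 8) >> 4;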
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^^^^^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^^^^^^^^^^^^ + uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); + sum = vaddq_u16(sum, p6q6_x7); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint16x8_t q0p0 = Transpose64(p0q0); + sum = vaddq_u16(sum, q0p0); + + *p5q5_output = vrshrq_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); + const uint16x8_t q1p1 = Transpose64(p1q1); + sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); + + *p4q4_output = vrshrq_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); + const uint16x8_t q2p2 = Transpose64(p2q2); + sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); + + *p3q3_output = vrshrq_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); + const uint16x8_t q3p3 = Transpose64(p3q3); + sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); + + *p2q2_output = vrshrq_n_u16(sum, 4); + + // Convert to p1 and q1 output: + // p1 = p2 - p6 - p3 + p0 + q4 + // q1 = q2 - q6 - q3 + q0 + p4 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); + const uint16x8_t q4p4 = Transpose64(p4q4); + sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); + + *p1q1_output = vrshrq_n_u16(sum, 4); + + // Convert to p0 and q0 output: + // p0 = p1 - p6 - p2 + q0 + q5 + // q0 = q1 - q6 - q2 + p0 + p5 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); + const uint16x8_t q5p5 = Transpose64(p5q5); + sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); + + *p0q0_output = vrshrq_n_u16(sum, 4); +} + +void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); + auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); + auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); + auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); + auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); + auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); + auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); + auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + auto* const dst_q4 = 
reinterpret_cast<uint16_t*>(dst + 4 * stride); + auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); + auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); + + const uint16x4_t src[14] = { + vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), + vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), + vld1_u16(dst_q5), vld1_u16(dst_q6)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); + const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); + const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); + const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); + const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); + const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. 
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + + vst1_u16(dst_p5, vget_low_u16(p5q5_output)); + vst1_u16(dst_p4, vget_low_u16(p4q4_output)); + vst1_u16(dst_p3, vget_low_u16(p3q3_output)); + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + vst1_u16(dst_q3, vget_high_u16(p3q3_output)); + vst1_u16(dst_q4, vget_high_u16(p4q4_output)); + vst1_u16(dst_q5, vget_high_u16(p5q5_output)); +} + +inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { + uint16x8x2_t acdb; +#if defined(__aarch64__) + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); +#else + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), + vreinterpretq_u64_u16(ab), 1)); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), + vreinterpretq_u64_u16(ab), 0)); +#endif // defined(__aarch64__) + return acdb; +} + +void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); + auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); + auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); + auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); + auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); + + // Low halves: p7 p6 p5 p4 + // High halves: p3 p2 p1 p0 + uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3)}; + // p7 will be the low half of src_p[0]. Not used until the end. 
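Concretely, PermuteACDB64() above cross-picks 64-bit halves. With each register written as its two halves (letters are illustrative values):

//   ab = {a, b}, cd = {c, d}
//   PermuteACDB64(ab, cd).val[0] == {a, c}  // vtrn1q_u64(ab, cd)
//   PermuteACDB64(ab, cd).val[1] == {d, b}  // vtrn2q_u64(cd, ab)
// So PermuteACDB64(p7q7, p3q3) yields {p7, p3} and {q3, q7}, the ordering the
// final Transpose4x8() calls below expect.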
+ Transpose4x8(src_p); + + // Low halves: q0 q1 q2 q3 + // High halves: q4 q5 q6 q7 + uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), + vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; + // q7 will be the high half of src_q[3]. Not used until the end. + Transpose4x8(src_q); + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = OuterThreshold( + vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), + vget_low_u16(src_q[1]), outer_thresh); + const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); + const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); + const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); + const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = + vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); + const uint16x8_t p5q5 = + vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); + const uint16x8_t p6q6 = + vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); + const uint16x8_t p7q7 = + vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. 
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+  Transpose4x8(output_p);
+  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+  Transpose4x8(output_q);
+
+  // Reverse p values to produce original order:
+  // p3 p2 p1 p0 q0 q1 q2 q3
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+}  // namespace
+
+void LoopFilterInit10bpp_NEON() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Horizontal4_NEON;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Horizontal6_NEON;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Horizontal8_NEON;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14_NEON;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Vertical14_NEON;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 8c03928..a8b236d 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -29,7 +29,6 @@
 namespace libgav1 {
 namespace dsp {
-namespace low_bitdepth {
 namespace {
 
 // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
@@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
   needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter4_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
   needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter4_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter6_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter6_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
-  // Consider doing this on armv7 if there is a quick way to check if a vector
-  // is zero.
   if (vaddv_u8(needs_filter8_mask) == 0) {
     // None of the values will be filtered.
     return;
@@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
   hev_mask = InterleaveLow32(hev_mask, hev_mask);
 
 #if defined(__aarch64__)
-  // This provides a good speedup for the unit test. Not sure how applicable it
-  // is to valid streams though.
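The comment being removed in these hunks refers to an all-zero test on the combined filter mask. Side by side, the two forms this file uses for the 10bpp (uint16x4_t) masks, as a sketch:

inline bool MaskIsZero(const uint16x4_t mask) {
#if defined(__aarch64__)
  return vaddv_u16(mask) == 0;  // horizontal add across the four lanes
#else
  // armv7: read the whole vector back as a single 64-bit lane.
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
#endif
}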
- // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, hev_mask = InterleaveLow32(hev_mask, hev_mask); #if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. if (vaddv_u8(needs_filter8_mask) == 0) { // None of the values will be filtered. return; @@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride, vst1q_u8(dst, output_3); } -void Init8bpp() { +} // namespace + +void LoopFilterInit_NEON() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = @@ -1178,1267 +1147,6 @@ void Init8bpp() { dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14_NEON; } -} // namespace -} // namespace low_bitdepth - -#if LIBGAV1_MAX_BITDEPTH >= 10 -namespace high_bitdepth { -namespace { - -// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) -inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { - const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); - return vorr_u16(vget_low_u16(a), vget_high_u16(a)); -} - -// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh -inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, - const uint16x4_t q0, const uint16x4_t q1, - const uint16_t outer_thresh) { - const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); - const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); - const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); - const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); - const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); - return vcle_u16(sum, vdup_n_u16(outer_thresh)); -} - -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && -// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && -// OuterThreshold() -inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); - return vand_u16(inner_mask, outer_mask); -} - -// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && -// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && -// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh -// OuterThreshold() -inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p1p2_q1q2, - const uint16x8_t abd_p2p3_q2q3, - const uint16_t inner_thresh, - const uint16x4_t outer_mask) { - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); - const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); - const uint16x8_t 
c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); - const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); - return vand_u16(inner_mask, outer_mask); -} - -// ----------------------------------------------------------------------------- -// FilterNMasks functions. - -inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const hev_mask, - uint16x4_t* const needs_filter4_mask) { - const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - // This includes cases where NeedsFilter4() is not true and so Filter2() will - // not be applied. - const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); - - *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask); - - // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. - *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); -} - -// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && -// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. -inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1, - const uint16x8_t abd_p0p2_q0q2) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); - const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(b), vget_high_u16(b)); -} - -inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, const uint16_t hev_thresh, - const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter6_mask, - uint16x4_t* const is_flat3_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2)); - *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), - inner_thresh, outer_mask); -} - -// IsFlat4 uses N=1, IsFlatOuter4 uses N=4. -// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && -// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && -// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh -// |flat_thresh| == 4 for 10 bit decode. -inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0, - const uint16x8_t abd_pn1p0_qn1q0, - const uint16x8_t abd_pn2p0_qn2q0) { - constexpr int flat_thresh = 1 << 2; - const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); - const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); - const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); - return vand_u16(vget_low_u16(c), vget_high_u16(c)); -} - -inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - const uint16_t hev_thresh, const uint16x4_t outer_mask, - const uint16_t inner_thresh, - uint16x4_t* const needs_filter8_mask, - uint16x4_t* const is_flat4_mask, - uint16x4_t* const hev_mask) { - const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); - *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh); - const uint16x4_t is_flat4 = - IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3)); - *needs_filter8_mask = - NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), - inner_thresh, outer_mask); - // |is_flat4_mask| is used to decide where to use the result of Filter8. 
- // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, - // overriding the question of whether to use Filter8. Because Filter4 doesn't - // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the - // source value. To be correct, the mask must account for this override. - *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask); -} - -// ----------------------------------------------------------------------------- -// FilterN functions. - -// Calculate Filter4() or Filter2() based on |hev_mask|. -inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, - const uint16x8_t p1q1, const uint16x4_t hev_mask, - uint16x8_t* const p1q1_result, - uint16x8_t* const p0q0_result) { - const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); - // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); - // q0mp0 means "q0 minus p0". - const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); - const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); - - // If this is for Filter2() then include |p1mq1|. Otherwise zero it. - const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/))); - const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1); - const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); - const int16x4_t p1mq1_saturated = - Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel); - const int16x4_t hev_option = - vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); - - const int16x4_t a = vadd_s16(q0mp0_3, hev_option); - - // Need to figure out what's going on here because there are some unnecessary - // tricks to accommodate 8x8 as smallest 8bpp vector - - // We can not shift with rounding because the clamp comes *before* the - // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = - // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; - const int16x4_t plus_four = - Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); - const int16x4_t plus_three = - Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); - const int16x4_t a1 = vshr_n_s16(plus_four, 3); - const int16x4_t a2 = vshr_n_s16(plus_three, 3); - - // a3 = (a1 + 1) >> 1; - const int16x4_t a3 = vrshr_n_s16(a1, 1); - - const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); - const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); - - // Need to shift the second term or we end up with a2_ma2. - const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); - const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); - *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10); - *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10); -} - -void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - - const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1)}; - - // Adjust thresholds to bitdepth. 
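The "<<= 2" lines here (and in every function in this file) exist because the filter strengths are specified at 8-bit scale, while one 10-bit pixel step spans four 8-bit steps; the matching constant appears above as |flat_thresh| = 1 << 2. The general form, as a hedged sketch:

inline int ScaleThreshold(const int thresh8, const int bitdepth) {
  return thresh8 << (bitdepth - 8);  // 10bpp: thresh8 << 2
}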
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test, but may not come up often - // enough to warrant it. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. - const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Offset by 2 uint16_t values to load from first p1 position. - auto* dst = static_cast<uint8_t*>(dest) - 4; - auto* dst_p1 = reinterpret_cast<uint16_t*>(dst); - auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride); - auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2); - auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3); - - uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1)}; - Transpose4x4(src); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); - const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); - Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, - &needs_filter4_mask); - -#if defined(__aarch64__) - // This provides a good speedup for the unit test. Not sure how applicable it - // is to valid streams though. - // Consider doing this on armv7 if there is a quick way to check if a vector - // is zero. - if (vaddv_u16(needs_filter4_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - const uint64x1_t needs_filter4_mask64 = - vreinterpret_u64_u16(needs_filter4_mask); - if (vget_lane_u64(needs_filter4_mask64, 0) == 0) { - // None of the values will be filtered. 
- return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter4_mask_8 = - vcombine_u16(needs_filter4_mask, needs_filter4_mask); - - uint16x8_t f_p1q1; - uint16x8_t f_p0q0; - const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0); - - // Already integrated the Hev mask when calculating the filtered values. - const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); - - // p1/q1 are unmodified if only Hev() is true. This works because it was and'd - // with |needs_filter4_mask| previously. - const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); - const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - vst1_u16(dst_p1, output[0]); - vst1_u16(dst_p0, output[1]); - vst1_u16(dst_q0, output[2]); - vst1_u16(dst_q1, output[3]); -} - -inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p1 and q1 output from opposite directions. - // The formula is regrouped to allow 3 doubling operations to be combined. - // - // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 - // ^^^^^^^^ - // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) - // ^^^^^^^^ - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^^^^^^ - uint16x8_t sum = vaddq_u16(p2q2, p1q1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p0q0); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^ - sum = vshlq_n_u16(sum, 1); - - // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 - // ^^^^^^ ^^^^^^ - // Should dual issue with the left shift. - const uint16x8_t q0p0 = Transpose64(p0q0); - const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); - sum = vaddq_u16(sum, outer_sum); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - (2 * p2) + q0 + q1 - // q0 = q1 - (2 * q2) + p0 + p1 - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); - // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 - // ^^^^^^^^ - sum = vsubq_u16(sum, p2q2_double); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - - const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1), - vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2)}; - - // Adjust thresholds to bitdepth. 
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. - uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); -} - -void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - // Left side of the filter window. - auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Overread by 2 values. These overreads become the high halves of src_raw[2] - // and src_raw[3] after transpose. 
- uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - Transpose4x8(src_raw); - // p2, p1, p0, q0, q1, q2 - const uint16x4_t src[6] = { - vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), - vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), - vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), - }; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat3_mask; - const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); - const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); - const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); - Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat3_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or - // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 - // output is not used. - uint16x8_t f6_p1q1, f6_p0q0; - const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); - if (vget_lane_u64(need_filter6, 0) == 0) { - // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); - p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x4_t output[4] = { - vget_low_u16(p1q1_output), - vget_low_u16(p0q0_output), - vget_high_u16(p0q0_output), - vget_high_u16(p1q1_output), - }; - Transpose4x4(output); - - // dst_n starts at p2, so adjust to p1. 
- vst1_u16(dst_0 + 1, output[0]); - vst1_u16(dst_1 + 1, output[1]); - vst1_u16(dst_2 + 1, output[2]); - vst1_u16(dst_3 + 1, output[3]); -} - -inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, - const uint16x8_t p1q1, const uint16x8_t p0q0, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p2 and q2 output from opposite directions. - // The formula is regrouped to allow 2 doubling operations to be combined. - // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 - // ^^^^^^^^ - // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) - // ^^^^^^^^ - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^ - uint16x8_t sum = vshlq_n_u16(p23q23, 1); - - // Add two other terms to make dual issue with shift more likely. - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^ - const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^^^^^^^^ - sum = vaddq_u16(sum, p01q01); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - sum = vaddq_u16(sum, p3q3); - - // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 - // ^^^^^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p2q2_output = vrshrq_n_u16(sum, 3); - - // Convert to p1 and q1 output: - // p1 = p2 - p3 - p2 + p1 + q1 - // q1 = q2 - q3 - q2 + q0 + p1 - sum = vsubq_u16(sum, p23q23); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); - - *p1q1_output = vrshrq_n_u16(sum, 3); - - // Convert to p0 and q0 output: - // p0 = p1 - p3 - p1 + p0 + q2 - // q0 = q1 - q3 - q1 + q0 + p2 - sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); - - *p0q0_output = vrshrq_n_u16(sum, 3); -} - -void Horizontal8_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - const uint16x4_t src[8] = { - vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), - vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)}; - - // Adjust thresholds to bitdepth. 
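The regrouped arithmetic in Filter8() above is easier to verify in scalar form. The sketch below computes the p-side outputs for one pixel position; the q side is symmetric, and the vector code produces both at once through the pNqN packing and Transpose64(). Names are illustrative, and the rounding matches vrshrq_n_u16(sum, 3), i.e. (sum + 4) >> 3. (The shift-by-2 lines that follow this note rescale thresholds defined at 8-bit depth to the 10-bit pixel range.)

#include <cstdint>

// Scalar model of Filter8()'s running sum for the p outputs.
void Filter8ScalarP(uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                    uint16_t q0, uint16_t q1, uint16_t q2,
                    uint16_t* out_p2, uint16_t* out_p1, uint16_t* out_p0) {
  // p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3, grouped as
  // p3 + 2*(p3 + p2) + p1 + p0 + q0 so one shift covers both doublings.
  uint32_t sum = p3 + 2 * (p3 + p2) + p1 + p0 + q0;
  *out_p2 = (sum + 4) >> 3;
  // Slide the window: drop p3 + p2, add p1 + q1.
  sum += p1 + q1 - (p3 + p2);
  *out_p1 = (sum + 4) >> 3;
  // Slide again: drop p3 + p1, add p0 + q2.
  sum += p0 + q2 - (p3 + p1);
  *out_p0 = (sum + 4) >> 3;
}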
- outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); - const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); - const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); - const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. 
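The vbslq_u16 chains in the branch below are per-lane multiplexers. Every mask lane is all ones or all zeros, so vbslq_u16(m, a, b) reduces per lane to (m & a) | (~m & b), and chaining two selects yields the three-way choice the comment above describes. A scalar model with illustrative names:

#include <cstdint>

// Per-lane model of vbslq_u16 for all-ones/all-zeros masks.
inline uint16_t Bsl(uint16_t m, uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((m & a) | (~m & b));
}

// Filter8 output where flat, else Filter4 output where filtering is
// needed, else the original pixel.
inline uint16_t SelectP1(uint16_t needs_filter, uint16_t is_flat4,
                         uint16_t f8_p1, uint16_t f4_p1, uint16_t p1) {
  return Bsl(needs_filter, Bsl(is_flat4, f8_p1, f4_p1), p1);
}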
- p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); -} - -inline uint16x8_t ReverseLowHalf(const uint16x8_t a) { - return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); -} - -void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. - // To get desired pairs after transpose, one half should be reversed. - uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - - // src[0] = p0q0 - // src[1] = p1q1 - // src[2] = p2q2 - // src[3] = p3q3 - LoopFilterTranspose4x8(src); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), - vget_high_u16(src[1]), outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = src[0]; - const uint16x8_t p1q1 = src[1]; - const uint16x8_t p2q2 = src[2]; - const uint16x8_t p3q3 = src[3]; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. 
|is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() does not apply, but Filter4() applies to one or more values. - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t is_flat4_mask_8 = - vcombine_u16(is_flat4_mask, is_flat4_mask); - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - - uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; - // After transpose, |output| will contain rows of the form: - // p0 p1 p2 p3 q0 q1 q2 q3 - Transpose4x8(output); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, ReverseLowHalf(output[0])); - vst1q_u16(dst_1, ReverseLowHalf(output[1])); - vst1q_u16(dst_2, ReverseLowHalf(output[2])); - vst1q_u16(dst_3, ReverseLowHalf(output[3])); -} -inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, - const uint16x8_t p4q4, const uint16x8_t p3q3, - const uint16x8_t p2q2, const uint16x8_t p1q1, - const uint16x8_t p0q0, uint16x8_t* const p5q5_output, - uint16x8_t* const p4q4_output, - uint16x8_t* const p3q3_output, - uint16x8_t* const p2q2_output, - uint16x8_t* const p1q1_output, - uint16x8_t* const p0q0_output) { - // Sum p5 and q5 output from opposite directions. 
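A scalar restatement of the 13-tap arithmetic annotated in the comments that follow may help; it covers the p side only (the q side is symmetric via the pNqN packing). Indexing is illustrative: p[0] = p6 through p[6] = p0, and q[0] = q0 through q[5] = q5.

#include <cstdint>

// Scalar model of Filter14()'s running sum. The tap weights total 16, so
// each output is the rounding shift (sum + 8) >> 4, i.e. vrshrq_n_u16.
void Filter14ScalarP(const uint16_t p[7], const uint16_t q[6],
                     uint16_t out_p[6] /* p5'..p0' */) {
  // 7*p6 is built as (p6 << 3) - p6, as in the NEON code below.
  uint32_t sum = (static_cast<uint32_t>(p[0]) << 3) - p[0] +
                 2 * (p[1] + p[2]) + p[3] + p[4] + p[5] + p[6] + q[0];
  out_p[0] = (sum + 8) >> 4;           // p5'
  sum += p[3] + q[1] - 2 * p[0];       // drop 2*p6, add p3 + q1
  out_p[1] = (sum + 8) >> 4;           // p4'
  sum += p[4] + q[2] - (p[0] + p[1]);  // drop p6 + p5, add p2 + q2
  out_p[2] = (sum + 8) >> 4;           // p3'
  sum += p[5] + q[3] - (p[0] + p[2]);  // drop p6 + p4, add p1 + q3
  out_p[3] = (sum + 8) >> 4;           // p2'
  sum += p[6] + q[4] - (p[0] + p[3]);  // drop p6 + p3, add p0 + q4
  out_p[4] = (sum + 8) >> 4;           // p1'
  sum += q[0] + q[5] - (p[0] + p[4]);  // drop p6 + p2, add q0 + q5
  out_p[5] = (sum + 8) >> 4;           // p0'
}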
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^ - const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^^^^^^^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^^^^^^^^^^^^^ - uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); - sum = vaddq_u16(sum, p6q6_x7); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^^^^^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^^^^^^ - sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); - - // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 - // ^^ - // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) - // ^^ - const uint16x8_t q0p0 = Transpose64(p0q0); - sum = vaddq_u16(sum, q0p0); - - *p5q5_output = vrshrq_n_u16(sum, 4); - - // Convert to p4 and q4 output: - // p4 = p5 - (2 * p6) + p3 + q1 - // q4 = q5 - (2 * q6) + q3 + p1 - sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); - const uint16x8_t q1p1 = Transpose64(p1q1); - sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); - - *p4q4_output = vrshrq_n_u16(sum, 4); - - // Convert to p3 and q3 output: - // p3 = p4 - p6 - p5 + p2 + q2 - // q3 = q4 - q6 - q5 + q2 + p2 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); - const uint16x8_t q2p2 = Transpose64(p2q2); - sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); - - *p3q3_output = vrshrq_n_u16(sum, 4); - - // Convert to p2 and q2 output: - // p2 = p3 - p6 - p4 + p1 + q3 - // q2 = q3 - q6 - q4 + q1 + p3 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); - const uint16x8_t q3p3 = Transpose64(p3q3); - sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); - - *p2q2_output = vrshrq_n_u16(sum, 4); - - // Convert to p1 and q1 output: - // p1 = p2 - p6 - p3 + p0 + q4 - // q1 = q2 - q6 - q3 + q0 + p4 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); - const uint16x8_t q4p4 = Transpose64(p4q4); - sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); - - *p1q1_output = vrshrq_n_u16(sum, 4); - - // Convert to p0 and q0 output: - // p0 = p1 - p6 - p2 + q0 + q5 - // q0 = q1 - q6 - q2 + p0 + p5 - sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); - const uint16x8_t q5p5 = Transpose64(p5q5); - sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); - - *p0q0_output = vrshrq_n_u16(sum, 4); -} - -void Horizontal14_NEON(void* const dest, const ptrdiff_t stride, - int outer_thresh, int inner_thresh, int hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest); - auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride); - auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride); - auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride); - auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride); - auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride); - auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride); - auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride); - auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - auto* const dst_q4 = 
reinterpret_cast<uint16_t*>(dst + 4 * stride); - auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride); - auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride); - - const uint16x4_t src[14] = { - vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), - vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), - vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), - vld1_u16(dst_q5), vld1_u16(dst_q6)}; - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = - OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); - const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); - const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); - const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); - const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); - const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - // ZIP1 p0q0, p1q1 may perform better here. - const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. 
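The |is_flat4_outer_mask| computed above extends the flatness test outward to p4/p5/p6 (and q4/q5/q6), so Filter14 is only considered where the Filter8 flatness test already passed. A scalar sketch of the resulting decision tree, assuming IsFlat4() compares each absolute difference against the AV1 flatness bound of 1 << (bitdepth - 8), i.e. 4 at 10 bpp:

#include <cstdint>
#include <cstdlib>

enum class FilterChoice { kNone, kFilter4, kFilter8, kFilter14 };

// Scalar model of the per-lane filter choice in Horizontal14/Vertical14.
FilterChoice ChooseFilter(bool needs_filter, bool is_flat4, uint16_t p0,
                          uint16_t p4, uint16_t p5, uint16_t p6) {
  constexpr int kFlatThresh = 1 << (10 - 8);  // Assumed 10 bpp bound.
  if (!needs_filter) return FilterChoice::kNone;
  if (!is_flat4) return FilterChoice::kFilter4;
  const bool outer_flat = std::abs(p0 - p4) <= kFlatThresh &&
                          std::abs(p0 - p5) <= kFlatThresh &&
                          std::abs(p0 - p6) <= kFlatThresh;
  return outer_flat ? FilterChoice::kFilter14 : FilterChoice::kFilter8;
}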
- p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - - vst1_u16(dst_p5, vget_low_u16(p5q5_output)); - vst1_u16(dst_p4, vget_low_u16(p4q4_output)); - vst1_u16(dst_p3, vget_low_u16(p3q3_output)); - vst1_u16(dst_p2, vget_low_u16(p2q2_output)); - vst1_u16(dst_p1, vget_low_u16(p1q1_output)); - vst1_u16(dst_p0, vget_low_u16(p0q0_output)); - vst1_u16(dst_q0, vget_high_u16(p0q0_output)); - vst1_u16(dst_q1, vget_high_u16(p1q1_output)); - vst1_u16(dst_q2, vget_high_u16(p2q2_output)); - vst1_u16(dst_q3, vget_high_u16(p3q3_output)); - vst1_u16(dst_q4, vget_high_u16(p4q4_output)); - vst1_u16(dst_q5, vget_high_u16(p5q5_output)); -} - -inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { - uint16x8x2_t acdb; -#if defined(__aarch64__) - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); -#else - // a[b] <- [c]d - acdb.val[0] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), - vreinterpretq_u64_u16(ab), 1)); - // [a]b <- c[d] - acdb.val[1] = vreinterpretq_u16_u64( - vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), - vreinterpretq_u64_u16(ab), 0)); -#endif // defined(__aarch64__) - return acdb; -} - -void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, - int inner_thresh, int 
hev_thresh) { - auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t); - auto* const dst_0 = reinterpret_cast<uint16_t*>(dst); - auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride); - auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride); - auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride); - - // Low halves: p7 p6 p5 p4 - // High halves: p3 p2 p1 p0 - uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), - vld1q_u16(dst_3)}; - // p7 will be the low half of src_p[0]. Not used until the end. - Transpose4x8(src_p); - - // Low halves: q0 q1 q2 q3 - // High halves: q4 q5 q6 q7 - uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), - vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)}; - // q7 will be the high half of src_q[3]. Not used until the end. - Transpose4x8(src_q); - - // Adjust thresholds to bitdepth. - outer_thresh <<= 2; - inner_thresh <<= 2; - hev_thresh <<= 2; - const uint16x4_t outer_mask = OuterThreshold( - vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), - vget_low_u16(src_q[1]), outer_thresh); - const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); - const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); - const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); - const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); - uint16x4_t hev_mask; - uint16x4_t needs_filter_mask; - uint16x4_t is_flat4_mask; - Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, - &needs_filter_mask, &is_flat4_mask, &hev_mask); - -#if defined(__aarch64__) - if (vaddv_u16(needs_filter_mask) == 0) { - // None of the values will be filtered. - return; - } -#else // !defined(__aarch64__) - // This might be faster than vaddv (latency 3) because mov to general register - // has latency 2. - const uint64x1_t needs_filter_mask64 = - vreinterpret_u64_u16(needs_filter_mask); - if (vget_lane_u64(needs_filter_mask64, 0) == 0) { - // None of the values will be filtered. - return; - } -#endif // defined(__aarch64__) - const uint16x8_t p4q4 = - vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); - const uint16x8_t p5q5 = - vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); - const uint16x8_t p6q6 = - vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); - const uint16x8_t p7q7 = - vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); - // Mask to choose between the outputs of Filter8 and Filter14. - // As with the derivation of |is_flat4_mask|, the question of whether to use - // Filter14 is only raised where |is_flat4_mask| is true. - const uint16x4_t is_flat4_outer_mask = vand_u16( - is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), - vabdq_u16(p0q0, p6q6))); - // Copy the masks to the high bits for packed comparisons later. - const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); - const uint16x8_t needs_filter_mask_8 = - vcombine_u16(needs_filter_mask, needs_filter_mask); - - uint16x8_t f4_p1q1; - uint16x8_t f4_p0q0; - const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); - Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); - f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); - - uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, - p5q5_output; - // Because we did not return after testing |needs_filter_mask| we know it is - // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or - // Filter8. 
Therefore if it is false when |needs_filter_mask| is true, Filter8 - // output is not used. - uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; - const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); - if (vget_lane_u64(need_filter8, 0) == 0) { - // Filter8() and Filter14() do not apply, but Filter4() applies to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = p2q2; - p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); - p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); - } else { - const uint16x8_t use_filter8_mask = - vcombine_u16(is_flat4_mask, is_flat4_mask); - Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); - const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); - if (vget_lane_u64(need_filter14, 0) == 0) { - // Filter14() does not apply, but Filter8() and Filter4() apply to one or - // more values. - p5q5_output = p5q5; - p4q4_output = p4q4; - p3q3_output = p3q3; - p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); - p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } else { - // All filters may contribute values to final outputs. - const uint16x8_t use_filter14_mask = - vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); - uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; - Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, - &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); - p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); - p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); - p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); - p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); - p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); - p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); - p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); - p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); - p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); - p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); - p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); - p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); - } - } - // To get the correctly ordered rows from the transpose, we need: - // p7p3 p6p2 p5p1 p4p0 - // q0q4 q1q5 q2q6 q3q7 - const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output); - const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output); - const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output); - const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output); - uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0], - p5p1_q1q5.val[0], p4p0_q0q4.val[0]}; - Transpose4x8(output_p); - uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1], - p6p2_q2q6.val[1], p7p3_q3q7.val[1]}; - Transpose4x8(output_q); - - // Reverse p values to produce original order: - // p3 p2 p1 p0 q0 q1 q2 q3 - vst1q_u16(dst_0, output_p[0]); - vst1q_u16(dst_0 + 8, output_q[0]); - vst1q_u16(dst_1, output_p[1]); - vst1q_u16(dst_1 + 8, output_q[1]); - vst1q_u16(dst_2, output_p[2]); - vst1q_u16(dst_2 + 8, output_q[2]); - vst1q_u16(dst_3, output_p[3]); - vst1q_u16(dst_3 + 8, output_q[3]); -} - -void Init10bpp() { - Dsp* const dsp = 
dsp_internal::GetWritableDspTable(kBitdepth10); - assert(dsp != nullptr); - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = - Horizontal4_NEON; - dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = - Horizontal6_NEON; - dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = - Horizontal8_NEON; - dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = - Horizontal14_NEON; - dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = - Vertical14_NEON; -} - -} // namespace -} // namespace high_bitdepth -#endif // LIBGAV1_MAX_BITDEPTH >= 10 - -void LoopFilterInit_NEON() { - low_bitdepth::Init8bpp(); -#if LIBGAV1_MAX_BITDEPTH >= 10 - high_bitdepth::Init10bpp(); -#endif -} } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h index 540defc..531cd0d 100644 --- a/src/dsp/arm/loop_filter_neon.h +++ b/src/dsp/arm/loop_filter_neon.h @@ -26,6 +26,7 @@ namespace dsp { // Initializes Dsp::loop_filters, see the defines below for specifics. This // function is not thread-safe. void LoopFilterInit_NEON(); +void LoopFilterInit10bpp_NEON(); } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index 2db137f..cd8552e 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0, const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], mas[2]; uint16x8_t sq[2][4], bs[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint8_t* const dst) { uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[4]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], @@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src, uint16x8_t ma[2]; uint8x16_t masx[3]; uint32x4x2_t b[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas, @@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width; uint8x16_t s[2], mas[2]; uint16x8_t sq[4], bs[3]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); @@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; uint8x16_t s[2][2], ma3[2][2], ma5[2]; uint16x8_t sq[2][4], b3[2][3], b5[3]; - // TODO(b/194217060): Future msan load. 
s[0][0] = vld1q_u8(src0); s[1][0] = vld1q_u8(src1); @@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow( uint8x16_t s[2], ma3[2], ma5[2]; uint16x8_t sq[4], ma[3], b3[3], b5[3]; uint32x4x2_t b[3]; - // TODO(b/194217060): Future msan load. s[0] = vld1q_u8(src0); BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, @@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow( do { uint8x16_t ma3x[3], ma5x[3]; int16x8_t p[2]; - // TODO(b/194217060): Future msan load. s[1] = vld1q_u8(src0 + x + 16); BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3, diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc index 853f949..ecc67f8 100644 --- a/src/dsp/arm/mask_blend_neon.cc +++ b/src/dsp/arm/mask_blend_neon.cc @@ -33,50 +33,40 @@ namespace dsp { namespace low_bitdepth { namespace { -// TODO(b/150461164): Consider combining with GetInterIntraMask4x2(). -// Compound predictors use int16_t values and need to multiply long because the -// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by -// int8_t and accumulate into int32_t instruction. -template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask))); - const int16x4_t mask_val1 = vreinterpret_s16_u16( - vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y)))); - int16x8_t final_val; - if (subsampling_y == 1) { - const int16x4_t next_mask_val0 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride))); - const int16x4_t next_mask_val1 = - vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3))); - final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1), - vcombine_s16(next_mask_val0, next_mask_val1)); - } else { - final_val = vreinterpretq_s16_u16( - vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1)))); - } - return vrshrq_n_s16(final_val, subsampling_y + 1); +template <int subsampling_y> +inline uint8x8_t GetMask4x2(const uint8_t* mask) { + if (subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz)); + const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz)); + + const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23); + // Use a halving add to work around the case where all |mask| values are 64. + return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]), + vreinterpret_u8_u32(row_02_13.val[1])), + 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val0 = Load4(mask); - const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + // subsampling_x == 1 + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } template <int subsampling_x, int subsampling_y> -inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) { +inline uint8x8_t GetMask8(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. 
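The halving-add comment above is worth unpacking: with both subsampling flags set, the spec mask value is (m00 + m01 + m10 + m11 + 2) >> 2. Each mask value is at most 64, so a horizontal pair sum fits a byte (at most 128), but the sum of two pair sums can reach 256 and would wrap in 8-bit lanes. vhadd computes (a + b) >> 1 at 9-bit internal precision, and the following rounding shift by 1 makes the result exactly equal to (a + b + 2) >> 2 for all byte inputs. A scalar model:

#include <cstdint>

// Scalar model of the 2x2 mask downsample. Valid for mask values <= 64,
// so the row sums below cannot overflow uint8_t.
uint8_t DownsampleMask2x2(uint8_t m00, uint8_t m01, uint8_t m10,
                          uint8_t m11) {
  const uint8_t row0 = static_cast<uint8_t>(m00 + m01);  // vld2 + vadd
  const uint8_t row1 = static_cast<uint8_t>(m10 + m11);
  const uint8_t half = (row0 + row1) >> 1;  // vhadd: no 8-bit overflow
  return (half + 1) >> 1;                   // vrshr_n_u8(half, 1)
}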
+ return vrshr_n_u8( + vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1); + } if (subsampling_x == 1) { - int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask))); - if (subsampling_y == 1) { - const int16x8_t next_mask_val = - vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride))); - mask_val = vaddq_s16(mask_val, next_mask_val); - } - return vrshrq_n_s16(mask_val, 1 + subsampling_y); + const uint8x8x2_t mask_val = vld2_u8(mask); + return vrhadd_u8(mask_val.val[0], mask_val.val[1]); } assert(subsampling_y == 0 && subsampling_x == 0); - const uint8x8_t mask_val = vld1_u8(mask); - return vreinterpretq_s16_u16(vmovl_u8(mask_val)); + return vld1_u8(mask); } inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, @@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, StoreHi4(dst + dst_stride, result); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); - int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + // Compound predictors use int16_t values and need to multiply long because + // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply + // int16_t by int8_t and accumulate into int32_t instruction. + int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - // TODO(b/150461164): Arm tends to do better with load(val); val += stride - // It may be possible to turn this into a loop with a templated height. 
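ZeroExtend(), used by the new code above, is defined elsewhere in the library's NEON helpers; mask values are at most 64, so reinterpreting the unsigned widening as signed is lossless. Presumably it has this shape (an assumption, not quoted from the library):

#include <arm_neon.h>

// Assumed shape of the ZeroExtend() helper used above: widen eight mask
// bytes to eight signed 16-bit lanes.
inline int16x8_t ZeroExtend(const uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}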
- pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } -template <int subsampling_x, int subsampling_y> +template <int subsampling_y> inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, + const int height, uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const uint8_t* mask = mask_ptr; if (height == 4) { - MaskBlending4x4_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask, mask_stride, dst, dst_stride); + MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride); return; } + constexpr int subsampling_x = 1; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const int16x8_t mask_inverter = vdupq_n_s16(64); int y = 0; do { int16x8_t pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask))); int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask)); pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); - pred_0 += 4 << 1; - pred_1 += 4 << 1; - mask += mask_stride << (1 + subsampling_y); - dst += dst_stride << 1; + pred_0 += 4 << subsampling_x; + pred_1 += 4 << subsampling_x; + mask += mask_stride << (subsampling_x + subsampling_y); + dst += dst_stride << subsampling_x; y += 8; } while (y < 
height); } +inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const int16x8_t pred_mask_0, + const int16x8_t pred_mask_1) { + // First 8 values. + const int16x8_t pred_val_0 = vld1q_s16(pred_0); + const int16x8_t pred_val_1 = vld1q_s16(pred_1); + // int res = (mask_value * prediction_0[x] + + // (64 - mask_value) * prediction_1[x]) >> 6; + const int32x4_t weighted_pred_lo = + vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); + const int32x4_t weighted_pred_hi = + vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); + const int32x4_t weighted_combo_lo = vmlal_s16( + weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1)); + const int32x4_t weighted_combo_hi = vmlal_s16( + weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1)); + + // dst[x] = static_cast<Pixel>( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), + vshrn_n_s32(weighted_combo_hi, 6)), + 4); +} + +template <int subsampling_x, int subsampling_y> +inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + const int16x8_t mask_inverter = vdupq_n_s16(64); + int y = height; + do { + const int16x8_t pred_mask_0 = + ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask)); + // 64 - mask + const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); + const uint8x8_t result = + CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1); + vst1_u8(dst, result); + dst += dst_stride; + mask += 8 << (subsampling_x + subsampling_y); + pred_0 += 8; + pred_1 += 8; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> +inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const uint8x16x2_t mask_val0 = vld2q_u8(mask); + const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride); + const uint8x16_t combined_horz0 = + vaddq_u8(mask_val0.val[0], mask_val0.val[1]); + const uint8x16_t combined_horz1 = + vaddq_u8(mask_val1.val[0], mask_val1.val[1]); + // Use a halving add to work around the case where all |mask| values are 64. 
+ return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1); + } + if (subsampling_x == 1) { + const uint8x16x2_t mask_val = vld2q_u8(mask); + return vrhaddq_u8(mask_val.val[0], mask_val.val[1]); + } + assert(subsampling_y == 0 && subsampling_x == 0); + return vld1q_u8(mask); +} + template <int subsampling_x, int subsampling_y> inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, @@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); if (width == 4) { - MaskBlending4xH_NEON<subsampling_x, subsampling_y>( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst, + dst_stride); + return; + } + if (width == 8) { + MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr, + height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, do { int x = 0; do { - const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( + const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>( mask + (x << subsampling_x), mask_stride); + const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0)); + const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0)); // 64 - mask - const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0); - const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x); - const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x); + const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo); + const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi); + uint8x8_t result; - // int res = (mask_value * prediction_0[x] + - // (64 - mask_value) * prediction_1[x]) >> 6; - const int32x4_t weighted_pred_0_lo = - vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); - const int32x4_t weighted_pred_0_hi = - vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); - const int32x4_t weighted_combo_lo = - vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1), - vget_low_s16(pred_val_1)); - const int32x4_t weighted_combo_hi = - vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), - vget_high_s16(pred_val_1)); - - // dst[x] = static_cast<Pixel>( - // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, - // (1 << kBitdepth8) - 1)); - result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), - vshrn_n_s32(weighted_combo_hi, 6)), - 4); + result = + CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo); vst1_u8(dst + x, result); - x += 8; + result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi, + pred_mask_1_hi); + vst1_u8(dst + x + 8, result); + + x += 16; } while (x < width); dst += dst_stride; pred_0 += width; @@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, } while (++y < height); } -// TODO(b/150461164): This is much faster for inter_intra (input is Pixel -// values) but regresses compound versions (input is int16_t). Try to -// consolidate these. 
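CombinePred8() above stacks two rounding stages: the shift by 6 removes the 64x mask weighting, and vqrshrun_n_s16(..., 4) applies the post-round shift the comment calls inter_post_round_bits (4 in this 8 bpp path) with rounding and unsigned saturation. A scalar model for one pixel; per the comment above, the compound inputs occupy 20 bits after weighting, so the narrow to 16 bits after the first (arithmetic) shift is lossless:

#include <algorithm>
#include <cstdint>

// Scalar model of the compound mask blend in CombinePred8().
uint8_t BlendCompound(int16_t pred0, int16_t pred1, int mask /* 0..64 */) {
  const int32_t weighted = mask * pred0 + (64 - mask) * pred1;
  const int32_t narrowed = weighted >> 6;       // vshrn_n_s32(..., 6)
  const int32_t rounded = (narrowed + 8) >> 4;  // vqrshrun_n_s16(..., 4)
  return static_cast<uint8_t>(std::min(255, std::max(0, rounded)));
}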
template <int subsampling_x, int subsampling_y> inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const uint8x8_t mask_val = - vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y))); - if (subsampling_y == 1) { - const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride), - vld1_u8(mask + mask_stride * 3)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. - const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_val, /*subsampling_x=*/1); + return GetMask4x2<subsampling_y>(mask); } - + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. assert(subsampling_y == 0 && subsampling_x == 0); const uint8x8_t mask_val0 = Load4(mask); - // TODO(b/150461164): Investigate the source of |mask| and see if the stride - // can be removed. - // TODO(b/150461164): The unit tests start at 8x8. Does this get run? return Load4<1>(mask + mask_stride, mask_val0); } -template <int subsampling_x, int subsampling_y> -inline uint8x8_t GetInterIntraMask8(const uint8_t* mask, - ptrdiff_t mask_stride) { - if (subsampling_x == 1) { - const uint8x16_t mask_val = vld1q_u8(mask); - const uint8x8_t mask_paired = - vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val)); - if (subsampling_y == 1) { - const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride); - const uint8x8_t next_mask_paired = - vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val)); - - // Use a saturating add to work around the case where all |mask| values - // are 64. Together with the rounding shift this ensures the correct - // result. 
- const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired); - return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); - } - - return vrshr_n_u8(mask_paired, /*subsampling_x=*/1); - } - - assert(subsampling_y == 0 && subsampling_x == 0); - return vld1_u8(mask); -} - inline void InterIntraWriteMaskBlendLine8bpp4x2( const uint8_t* LIBGAV1_RESTRICT const pred_0, uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, @@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON( } template <int subsampling_x, int subsampling_y> +inline void InterIntraMaskBlending8bpp8xH_NEON( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int height) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + int y = height; + do { + const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask); + // 64 - mask + const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0 = vld1_u8(pred_0); + const uint8x8_t pred_val_1 = vld1_u8(pred_1); + const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo = + vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); + const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); + vst1_u8(pred_1, result); + + pred_0 += 8; + pred_1 += pred_stride_1; + mask += mask_stride << subsampling_y; + } while (--y != 0); +} + +template <int subsampling_x, int subsampling_y> inline void InterIntraMaskBlend8bpp_NEON( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, @@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON( height); return; } + if (width == 8) { + InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>( + prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, + height); + return; + } const uint8_t* mask = mask_ptr; - const uint8x8_t mask_inverter = vdup_n_u8(64); + const uint8x16_t mask_inverter = vdupq_n_u8(64); int y = 0; do { int x = 0; do { - // TODO(b/150461164): Consider a 16 wide specialization (at least for the - // unsampled version) to take advantage of vld1q_u8(). - const uint8x8_t pred_mask_1 = - GetInterIntraMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride); + const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>( + mask + (x << subsampling_x), mask_stride); // 64 - mask - const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); - const uint8x8_t pred_val_0 = vld1_u8(prediction_0); + const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0); + prediction_0 += 8; + const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0); prediction_0 += 8; - const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x); - const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // Ensure armv7 build combines the load. 
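The 8 bpp inter-intra path added above works entirely in pixel values, so one widening multiply-accumulate and a single rounding narrow by 6 replace the compound path's two-stage rounding. A scalar model with an illustrative name; as in the code, |mask| weights prediction_1:

#include <cstdint>

// Scalar model of the inter-intra blend (vmull_u8 + vmlal_u8 +
// vrshrn_n_u16). The weighted sum is at most 64 * 255 and fits uint16_t.
uint8_t BlendInterIntra(uint8_t pred0, uint8_t pred1, int mask /* 0..64 */) {
  const uint16_t weighted =
      static_cast<uint16_t>((64 - mask) * pred0 + mask * pred1);
  return static_cast<uint8_t>((weighted + 32) >> 6);  // vrshrn_n_u16(.., 6)
}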
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x); + const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1); + const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1); + const uint16x8_t weighted_pred_0_lo = + vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo); // weighted_pred0 + weighted_pred1 - const uint16x8_t weighted_combo = - vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); - const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); - vst1_u8(prediction_1 + x, result); + const uint16x8_t weighted_combo_lo = + vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo); + const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6); + vst1_u8(prediction_1 + x, result_lo); + const uint16x8_t weighted_pred_0_hi = + vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo_hi = vmlal_u8( + weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi); + const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6); + vst1_u8(prediction_1 + x + 8, result_hi); - x += 8; + x += 16; } while (x < width); prediction_1 += prediction_stride_1; mask += mask_stride << subsampling_y; diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc index 659ed8e..271bbaa 100644 --- a/src/dsp/arm/obmc_neon.cc +++ b/src/dsp/arm/obmc_neon.cc @@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred, StoreLo4(pred, result); } +inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred, + const uint8x8_t obmc_pred_val, + const uint8x8_t pred_mask, + const uint8x8_t obmc_pred_mask) { + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + vst1_u8(pred, result); +} + inline void OverlapBlendFromLeft2xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, @@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON( inline void OverlapBlendFromLeft8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); + constexpr int obmc_prediction_stride = 8; // 64 - mask const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); int y = 0; do { - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask); + pred += prediction_stride; - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask, + obmc_pred_mask); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != height); + + obmc_pred += obmc_prediction_stride << 1; + y += 2; + } while (y != height); } void OverlapBlendFromLeft_NEON( @@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON( return; } if (width == 8) { - OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + 
OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint8x16_t mask_inverter = vdupq_n_u8(64); @@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON( inline void OverlapBlendFromTop8xH_NEON( uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 8; const uint8x8_t mask_inverter = vdup_n_u8(64); const uint8_t* mask = kObmcMask + height - 2; const int compute_height = height - (height >> 2); int y = 0; do { - const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]); // 64 - mask - const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); - const uint8x8_t pred_val = vld1_u8(pred); - const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); - const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); - const uint8x8_t result = - vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); - vst1_u8(pred, result); + WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0, + obmc_pred_mask0); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (++y != compute_height); + ++y; + + const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1); + WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1, + obmc_pred_mask1); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (++y < compute_height); } void OverlapBlendFromTop_NEON( @@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON( } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred); return; } @@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; -inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x4_t BlendObmc2Or4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, const uint16x4_t pred_mask, const uint16x4_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val); const uint16x4_t result = vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); return result; } -inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, +inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t 
weighted_pred = vmulq_u16(pred_mask, pred_val); const uint16x8_t result = vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); @@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred, } inline void OverlapBlendFromLeft2xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 2; const uint16x4_t mask_inverter = vdup_n_u16(64); // Second two lanes unused. const uint16x4_t pred_mask = vld1_u16(kObmcMask); const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { + const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred); const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0); + BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_0); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; + const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred); const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1); + BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask); + Store2<0>(pred, result_1); - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; y += 2; @@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON( } inline void OverlapBlendFromLeft4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { + constexpr int obmc_prediction_stride = 4; const uint16x4_t mask_inverter = vdup_n_u16(64); const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2); // 64 - mask const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask); int y = 0; do { - const uint16x4_t result_0 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - const uint16x4_t result_1 = - BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result_0); + pred = AddByteStride(pred, prediction_stride); + + const uint16x4_t result_1 = BlendObmc2Or4( + pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask); + vst1_u16(pred, result_1); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y != height); @@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = 
static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 2); assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16_t* mask = kObmcMask + width - 2; int x = 0; do { - pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x); - obmc_pred = reinterpret_cast<const uint8_t*>( - static_cast<const uint16_t*>(obmc_prediction) + x); + uint16_t* pred_x = pred + x; + const uint16_t* obmc_pred_x = obmc_pred + x; const uint16x8_t pred_mask = vld1q_u16(mask + x); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); int y = 0; do { const uint16x8_t result = - BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask); + vst1q_u16(pred_x, result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred_x = AddByteStride(pred_x, prediction_stride); + obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride); } while (++y < height); x += 8; } while (x < width); } template <int lane> -inline uint16x4_t BlendObmcFromTop4( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x4_t obmc_pred_val = - vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); +inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred, + const uint16x4_t obmc_pred_val, + const uint16x8_t pred_mask, + const uint16x8_t obmc_pred_mask) { + const uint16x4_t pred_val = vld1_u16(pred); const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask); const uint16x4_t result = vrshr_n_u16( VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4( template <int lane> inline uint16x8_t BlendObmcFromTop8( - uint8_t* LIBGAV1_RESTRICT const pred, - const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask, - const uint16x8_t obmc_pred_mask) { - const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred)); - const uint16x8_t obmc_pred_val = - vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred)); + uint16_t* LIBGAV1_RESTRICT const pred, + const uint16_t* LIBGAV1_RESTRICT const obmc_pred, + const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) { + const uint16x8_t pred_val = vld1q_u16(pred); + const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask); const uint16x8_t result = vrshrq_n_u16( VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6); @@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8( } inline void OverlapBlendFromTop4x2Or4_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int 
height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { + constexpr int obmc_prediction_stride = 4; const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]); const uint16x8_t mask_inverter = vdupq_n_u16(64); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); if (height == 2) { // Mask value is 64, meaning |pred| is unchanged. return; } - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); + const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred); + result = + BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask); + vst1_u16(pred, result); } inline void OverlapBlendFromTop4xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) { if (height < 8) { - OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height); return; } + constexpr int obmc_prediction_stride = 4; const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); int y = 0; @@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON( do { const uint16x8_t pred_mask = vld1q_u16(&mask[y]); const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); - uint16x4_t result = - BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - - result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += 
obmc_prediction_stride; - - result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + // Load obmc row 0, 1. + uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred); + uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val), + pred_mask, obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 2, 3. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; + + // Load obmc row 4, 5. + obmc_pred_val = vld1q_u16(obmc_pred); + result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + + result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask, + obmc_pred_mask); + vst1_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); + obmc_pred += obmc_prediction_stride << 1; // Increment for the right mask index. y += 6; @@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON( } inline void OverlapBlendFromTop8xH_NEON( - uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, - const uint8_t* LIBGAV1_RESTRICT obmc_pred, - const ptrdiff_t obmc_prediction_stride, const int height) { + uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) { const uint16_t* mask = kObmcMask + height - 2; const uint16x8_t mask_inverter = vdupq_n_u16(64); uint16x8_t pred_mask = vld1q_u16(mask); uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); uint16x8_t result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 2) return; - pred += prediction_stride; + constexpr int obmc_prediction_stride = 8; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 4) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, 
obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 8) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[8]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); if (height == 16) return; - pred += prediction_stride; + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; pred_mask = vld1q_u16(&mask[16]); obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask); - 
vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); - pred += prediction_stride; + vst1q_u16(pred, result); + pred = AddByteStride(pred, prediction_stride); obmc_pred += obmc_prediction_stride; result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask); - vst1q_u16(reinterpret_cast<uint16_t*>(pred), result); + vst1q_u16(pred, result); } void OverlapBlendFromTop_NEON( @@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON( const int width, const int height, const void* LIBGAV1_RESTRICT const obmc_prediction, const ptrdiff_t obmc_prediction_stride) { - auto* pred = static_cast<uint8_t*>(prediction); - const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction); + auto* pred = static_cast<uint16_t*>(prediction); + const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction); assert(width >= 4); assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, - obmc_prediction_stride, height); + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height); return; } @@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON( const uint16x8_t pred_mask = vld1q_u16(mask); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); -#define OBMC_ROW_FROM_TOP(n) \ - do { \ - int x = 0; \ - do { \ - const uint16x8_t result = BlendObmcFromTop8<n>( \ - reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \ - reinterpret_cast<const uint8_t*>( \ - reinterpret_cast<const uint16_t*>(obmc_pred) + x), \ - pred_mask, obmc_pred_mask); \ - vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \ - \ - x += 8; \ - } while (x < width); \ +#define OBMC_ROW_FROM_TOP(n) \ + do { \ + int x = 0; 
\ + do { \ + const uint16x8_t result = BlendObmcFromTop8<n>( \ + pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \ + vst1q_u16(pred + x, result); \ + \ + x += 8; \ + } while (x < width); \ } while (false) // Compute 1 row. @@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON( // Compute 3 rows. if (height == 4) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); return; } @@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON( // Compute 6 rows. if (height == 8) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); return; } @@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON( // Compute 12 rows. 
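// Aside (illustrative, not part of this patch): every OBMC variant in this
// file computes the same per-pixel blend with a 6-bit mask scale, and the
// vrshr/vrshrn intrinsics with shift 6 supply the +32 rounding term. A
// scalar reference, using a hypothetical helper name:
inline void BlendObmcRowScalar(uint16_t* pred, const uint16_t* obmc_pred,
                               const uint16_t mask, const int width) {
  const uint16_t obmc_mask = 64 - mask;  // The mask_inverter step above.
  for (int x = 0; x < width; ++x) {
    pred[x] = (mask * pred[x] + obmc_mask * obmc_pred[x] + 32) >> 6;
  }
}
// |prediction_stride| is in bytes while |pred| is a uint16_t pointer in this
// 10bpp path, hence AddByteStride() instead of plain pointer arithmetic;
// presumably (its definition is not shown in this diff) it is equivalent to
//   reinterpret_cast<uint16_t*>(reinterpret_cast<uint8_t*>(pred) + stride).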
if (height == 16) { OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); const uint16x8_t pred_mask = vld1q_u16(&mask[8]); // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); return; } @@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON( // 64 - mask const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask); OBMC_ROW_FROM_TOP(0); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(1); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(2); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(3); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(4); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, 
obmc_prediction_stride); OBMC_ROW_FROM_TOP(5); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(6); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); OBMC_ROW_FROM_TOP(7); - pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + pred = AddByteStride(pred, prediction_stride); + obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride); y += 8; } while (y < compute_height); diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc index 71e0a43..da380b1 100644 --- a/src/dsp/arm/warp_neon.cc +++ b/src/dsp/arm/warp_neon.cc @@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint8_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t row_border_pixel = first_row_border[row * source_stride]; @@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. 
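// Aside (illustrative, not part of this patch): GetWarpFilterParams() bundles
// the per-block warp coordinates that the deleted lines above computed
// inline. Based on those deleted lines, the helper and its WarpFilterParams
// result are presumably equivalent to:
struct WarpFilterParams {
  int x4;   // Warped x in warp-model fixed point.
  int y4;   // Warped y in warp-model fixed point.
  int ix4;  // Integer part of x4.
  int iy4;  // Integer part of y4.
};

inline WarpFilterParams GetWarpFilterParams(const int src_x, const int src_y,
                                            const int subsampling_x,
                                            const int subsampling_y,
                                            const int* const warp_params) {
  WarpFilterParams params;
  const int dst_x =
      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
  const int dst_y =
      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = params.x4 >> kWarpedModelPrecisionBits;
  params.iy4 = params.y4 >> kWarpedModelPrecisionBits;
  return params;
}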
- const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * source_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_centered, intermediate_result[y + 7]); @@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint8_t* const src_row = src + row * source_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 bytes that extend the // frame boundary pixels. We also assume there is at least one extra // padding byte after the right border of the last source row. - const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]); + const uint8x16_t src_row_v = + vld1q_u8(&src_row[filter_params.ix4 - 7]); // Convert src_row_v to int8 (subtract 128). const int8x16_t src_row_centered = vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128))); @@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. 
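// Aside: in the sy4 computation below, kWarpedModelPrecisionBits (16 in AV1)
// is the fixed-point precision of the warp model, so
//   filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)
// keeps only the sub-pixel phase of y4, while iy4 == y4 >> 16 kept the
// integer row. For example, y4 == 0x28000 gives iy4 == 2 with phase 0x8000
// (half a pixel). Subtracting MultiplyBy4(delta) re-centers the phase on the
// first of the 8 output rows (offset -4 from the block center), after which
// the row loop advances it by delta per row.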
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; @@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, do { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const uint16_t* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t row_border_pixel = first_row_border[row * src_stride]; DestType* dst_row = dst + start_x - block_start_x; @@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * src_stride]; sum <<= (kFilterBits - kInterRoundBitsHorizontal); intermediate_result_column[y + 7] = sum; } // Vertical filter. 
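// Aside: the single-column shortcut above works because in regions 1 and 2
// every tap of the horizontal filter reads the same border pixel, and the
// filter coefficients sum to 1 << kFilterBits. The horizontal stage then
// collapses to
//   (border_pixel << kFilterBits) >> kInterRoundBitsHorizontal
//       == border_pixel << (kFilterBits - kInterRoundBitsHorizontal),
// which is exactly the shift applied to |sum|, so one intermediate column
// serves the whole 8x8 block.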
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); #if defined(__aarch64__) @@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // has left and right borders of at least 13 pixels that extend the // frame boundary pixels. We also assume there is at least one extra // padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; @@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, } else { // Region 4. // Horizontal filter. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved in // warp.cc. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const uint16_t* const src_row = src + row * src_stride; // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also // read but is ignored. @@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // frame has left and right borders of at least 13 pixels that extend // the frame boundary pixels. We also assume there is at least one // extra padding pixel after the right border of the last source row. - const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); + const uint16x8x2_t src_row_v = + LoadSrcRow(&src_row[filter_params.ix4 - 7]); HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); sx4 += beta; } @@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source, // Regions 3 and 4. // Vertical filter. 
DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); int16x8_t filter[8]; diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc index 273b355..1a37aa1 100644 --- a/src/dsp/average_blend.cc +++ b/src/dsp/average_blend.cc @@ -87,6 +87,21 @@ void Init10bpp() { } #endif +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->average_blend = AverageBlend_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_AverageBlend + dsp->average_blend = AverageBlend_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + } // namespace void AverageBlendInit_C() { @@ -94,6 +109,9 @@ void AverageBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc index 04e24e5..6d1100a 100644 --- a/src/dsp/average_blend_test.cc +++ b/src/dsp/average_blend_test.cc @@ -59,6 +59,7 @@ template <int bitdepth, typename Pixel> class AverageBlendTest : public testing::TestWithParam<BlockSize>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); AverageBlendTest() = default; ~AverageBlendTest() override = default; @@ -282,6 +283,60 @@ INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using AverageBlendTest12bpp = AverageBlendTest<12, uint16_t>; + +const char* GetAverageBlendDigest12bpp(const BlockSize block_size) { + static const char* const kDigests[kMaxBlockSizes] = { + // 4xN + "8f5ad8fba61a0f1cb6b77f5460c241be", + "3a9d017848fdb4162315c689b4449ac6", + "bb97029fff021b168b98b209dcee5123", + // 8xN + "a7ff1b199965b8856499ae3f1b2c48eb", + "05220c72835fc4662d261183df0a57cf", + "97de8c325f1475c44e1afc44183e55ad", + "60d820c46cad14d9d934da238bb79707", + // 16xN + "f3e4863121819bc28f7c1f453898650c", + "5f5f68d21269d7df546c848921e8f2cd", + "17efe0b0fce1f8d4c7bc6eacf769063e", + "3da591e201f44511cdd6c465692ace1e", + "5a0ca6c88664d2e918a032b5fcf66070", + // 32xN + "efe236bee8a9fef90b99d8012006f985", + "d6ff3aacbbbadff6d0ccb0873fb9fa2a", + "38801f7361052873423d57b574aabddc", + "55c76772ecdc1721e92ca04d2fc7c089", + // 64xN + "4261ecdde34eedc4e5066a93e0f64881", + "fe82e012efab872672193316d670fd82", + "6c698bc2d4acf4444a64ac55ae9641de", + "98626e25101cff69019d1b7e6e439404", + // 128xN + "fe0f3c89dd39786df1c952a2470d680d", + "af7e166fc3d8c9ce85789acf3467ed9d", + }; + assert(block_size < kMaxBlockSizes); + return kDigests[block_size]; +} + +TEST_P(AverageBlendTest12bpp, Blending) { + Test(GetAverageBlendDigest12bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest12bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest12bpp(GetParam()), + kNumSpeedTests / + (kBlockHeightPixels[GetParam()] * kBlockHeightPixels[GetParam()]) / + 2, + false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest12bpp, + testing::ValuesIn(kTestParam)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc index ca2adfd..9dd9287 
100644 --- a/src/dsp/cdef.cc +++ b/src/dsp/cdef.cc @@ -32,9 +32,11 @@ namespace { #include "src/dsp/cdef.inc" // Silence unused function warnings when CdefDirection_C is obviated. -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ - !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ - (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection)) +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && \ + !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \ + (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection)) constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105}; int32_t Square(int32_t x) { return x * x; } @@ -103,12 +105,15 @@ void CdefDirection_C(const void* LIBGAV1_RESTRICT const source, #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || // !defined(LIBGAV1_Dsp8bpp_CdefDirection) || // (LIBGAV1_MAX_BITDEPTH >= 10 && - // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) + // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) + // (LIBGAV1_MAX_BITDEPTH == 12 && + // !defined(LIBGAV1_Dsp12bpp_CdefDirection)) // Silence unused function warnings when CdefFilter_C is obviated. -#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ - !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \ - (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \ + (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters)) int Constrain(int diff, int threshold, int damping) { assert(threshold != 0); @@ -218,7 +223,9 @@ void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src, #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || // !defined(LIBGAV1_Dsp8bpp_CdefFilters) || // (LIBGAV1_MAX_BITDEPTH >= 10 && - // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + // (LIBGAV1_MAX_BITDEPTH == 12 && + // !defined(LIBGAV1_Dsp12bpp_CdefFilters)) void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(8); @@ -294,7 +301,48 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->cdef_direction = CdefDirection_C<12, uint16_t>; + dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>; + dsp->cdef_filters[1][1] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_CdefDirection + dsp->cdef_direction = CdefDirection_C<12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_CdefFilters + dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>; + dsp->cdef_filters[1][1] = + 
CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -303,6 +351,9 @@ void CdefInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h index b820b77..ce23ea5 100644 --- a/src/dsp/cdef.h +++ b/src/dsp/cdef.h @@ -38,6 +38,11 @@ namespace libgav1 { namespace dsp { +enum { + kCdefSecondaryTap0 = 2, + kCdefSecondaryTap1 = 1, +}; + // Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not // thread-safe. void CdefInit_C(); diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc index c10a8d7..c25d7df 100644 --- a/src/dsp/cdef_test.cc +++ b/src/dsp/cdef_test.cc @@ -46,10 +46,11 @@ constexpr int kSourceBufferSize = constexpr int kNumSpeedTests = 5000; const char* GetDirectionDigest(const int bitdepth, const int num_runs) { - static const char* const kDigest[2][2] = { + static const char* const kDigest[3][2] = { {"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"}, - {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}}; - const int bitdepth_index = (bitdepth == 8) ? 0 : 1; + {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}, + {"5532919a157c4f937da9e822bdb105f7", "dd9dfca6dfca83777d942e693c17627a"}}; + const int bitdepth_index = (bitdepth - 8) / 2; const int run_index = (num_runs == 1) ? 0 : 1; return kDigest[bitdepth_index][run_index]; } @@ -59,6 +60,7 @@ const char* GetDirectionDigest(const int bitdepth, const int num_runs) { template <int bitdepth, typename Pixel> class CdefDirectionTest : public testing::TestWithParam<int> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); CdefDirectionTest() = default; CdefDirectionTest(const CdefDirectionTest&) = delete; CdefDirectionTest& operator=(const CdefDirectionTest&) = delete; @@ -167,6 +169,18 @@ INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest10bpp, testing::Values(0)); #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using CdefDirectionTest12bpp = CdefDirectionTest<12, uint16_t>; + +TEST_P(CdefDirectionTest12bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefDirectionTest12bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests / 100); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest12bpp, testing::Values(0)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + const char* GetDigest8bpp(int id) { static const char* const kDigest[] = { "b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33", @@ -199,6 +213,23 @@ const char* GetDigest10bpp(int id) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigest12bpp(int id) { + static const char* const kDigest[] = { + "a32569989c42fd4254979f70c1c65f5a", "dc389048217633e2dd64126376be7d25", + "3b0e8dae294895330f349863b1773c39", "9741fe8d27d109cb99b7a9cdc030f52a", + "ab70f3729b52287c6432ba7624280a68", "c1e5cf39cbc8030b82e09633c6c67d42", + "d5120a196164ff5a0ad7aa8c02e9b064", "1133759f3aee3a362a0ab668f6faf843", + "feb0ab7f515665f79fce213e8cd2fb10", "e86ea55c2d6d5cc69716535bd455c99f", + "e463da1b9d089b6ee82c041794257fd7", "27800e4af0cceeaf0a95c96275a7befe", + "f42e426481db00582b327eb2971bca96", "6127ff289833dde0270000d8240f36b7", + 
"cc5dbaf70e2fef7729a8e2ea9937fbcf", "51850b4e3e2a3919e110376fcb6318d3", + "d5ac7ac25eb1b5aee293b2a2ec9de775", "64ecc00b2e24a2f07df833fb50ce09c3", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct CdefTestParam { CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4, int columns4x4) @@ -224,6 +255,7 @@ std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) { template <int bitdepth, typename Pixel> class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); CdefFilteringTest() = default; CdefFilteringTest(const CdefFilteringTest&) = delete; CdefFilteringTest& operator=(const CdefFilteringTest&) = delete; @@ -328,19 +360,26 @@ void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) { } for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { - if (bitdepth == 8) { - test_utils::CheckMd5Digest(kCdef, kCdefFilterName, - GetDigest8bpp(id + plane), - reinterpret_cast<uint8_t*>(dest_[plane]), - sizeof(dest_[plane]), elapsed_time); + const char* expected_digest = nullptr; + switch (bitdepth) { + case 8: + expected_digest = GetDigest8bpp(id + plane); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - test_utils::CheckMd5Digest(kCdef, kCdefFilterName, - GetDigest10bpp(id + plane), - reinterpret_cast<uint8_t*>(dest_[plane]), - sizeof(dest_[plane]), elapsed_time); -#endif // LIBGAV1_MAX_BITDEPTH >= 10 + case 10: + expected_digest = GetDigest10bpp(id + plane); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetDigest12bpp(id + plane); + break; +#endif } + ASSERT_NE(expected_digest, nullptr); + test_utils::CheckMd5Digest(kCdef, kCdefFilterName, expected_digest, + reinterpret_cast<uint8_t*>(dest_[plane]), + sizeof(dest_[plane]), elapsed_time); } } @@ -396,6 +435,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using CdefFilteringTest12bpp = CdefFilteringTest<12, uint16_t>; + +TEST_P(CdefFilteringTest12bpp, Correctness) { TestRandomValues(1); } + +TEST_P(CdefFilteringTest12bpp, DISABLED_Speed) { + TestRandomValues(kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest12bpp, + testing::ValuesIn(cdef_test_param)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/constants.h b/src/dsp/constants.h index 7c1b62c..dd0a4e0 100644 --- a/src/dsp/constants.h +++ b/src/dsp/constants.h @@ -27,25 +27,7 @@ namespace libgav1 { enum { - // Documentation variables. - kBitdepth8 = 8, - kBitdepth10 = 10, - kBitdepth12 = 12, - // Weights are quadratic from '1' to '1 / block_size', scaled by - // 2^kSmoothWeightScale. - kSmoothWeightScale = 8, kCflLumaBufferStride = 32, - // InterRound0, Section 7.11.3.2. - kInterRoundBitsHorizontal = 3, // 8 & 10-bit. - kInterRoundBitsHorizontal12bpp = 5, - kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction. - kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction. - kInterRoundBitsVertical12bpp = 9, - // Offset applied to 10bpp and 12bpp predictors to allow storing them in - // uint16_t. Removed before blending. 
- kCompoundOffset = (1 << 14) + (1 << 13), - kCdefSecondaryTap0 = 2, - kCdefSecondaryTap1 = 1, }; // anonymous enum extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8]; diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc index f11b45e..6989da0 100644 --- a/src/dsp/convolve.cc +++ b/src/dsp/convolve.cc @@ -864,7 +864,93 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>; + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>; + dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>; + dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>; + + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>; + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>; + + dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>; + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>; + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>; + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>; + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + + dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>; + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>; +#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy + dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveHorizontal + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical + dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Convolve2D + dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy + dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>; +#endif + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + +#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D + 
dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -873,6 +959,9 @@ void ConvolveInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h index 5bc0bad..8780bfc 100644 --- a/src/dsp/convolve.h +++ b/src/dsp/convolve.h @@ -17,6 +17,8 @@ #ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_ #define LIBGAV1_SRC_DSP_CONVOLVE_H_ +#include <cassert> + // Pull in LIBGAV1_DspXXX defines representing the implementation status // of each function. The resulting value of each can be used by each module to // determine whether an implementation is needed at compile time. @@ -43,6 +45,35 @@ namespace dsp { // thread-safe. void ConvolveInit_C(); +inline int GetNumTapsInFilter(const int filter_index) { + if (filter_index < 2) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. + return 4; +} + } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc index e0f755e..2e0b270 100644 --- a/src/dsp/convolve.inc +++ b/src/dsp/convolve.inc @@ -12,39 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Constants and utility functions used for convolve implementations. +// Constants used for convolve implementations. // This will be included inside an anonymous namespace on files where these are // necessary. -int GetNumTapsInFilter(const int filter_index) { - if (filter_index < 2) { - // Despite the names these only use 6 taps. - // kInterpolationFilterEightTap - // kInterpolationFilterEightTapSmooth - return 6; - } - - if (filter_index == 2) { - // kInterpolationFilterEightTapSharp - return 8; - } - - if (filter_index == 3) { - // kInterpolationFilterBilinear - return 2; - } - - assert(filter_index > 3); - // For small sizes (width/height <= 4) the large filters are replaced with 4 - // tap options. - // If the original filters were |kInterpolationFilterEightTap| or - // |kInterpolationFilterEightTapSharp| then it becomes - // |kInterpolationFilterSwitchable|. - // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 - // tap filter. 
- return 4; -} - constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels; constexpr int kIntermediateStride = 8; constexpr int kHorizontalOffset = 3; diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc index 295c814..42cdeb7 100644 --- a/src/dsp/convolve_test.cc +++ b/src/dsp/convolve_test.cc @@ -418,6 +418,166 @@ const char* GetConvolveScaleDigest10bpp(int id) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetConvolveDigest12bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. + static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = { + "e25031afae184cc4d186cde7e3d51e33", "6fb55dec2506dae6c229469cdf2e7d83", + "9df34d27f5bd040d1ed1455b151cd1ff", "7f6829458f00edb88f78851dd1a08739", + "a8bbe9b6b9eaf6f681d91c981b994949", "21f74980b36cb246426f4bc3fe7c08c3", + "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5", + "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769", + "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68", + "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86", + "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8", + "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed", + "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998", + "291e67095399651dc5c8a033390f255f", "66b26828e434faf37ddc57d3e0abb6db", + "e9cd69e9ba70864e3d0b175ac0a177d6", "64e4db895a843cb05384f5997b1ba978", + "f305161c82de999d2c93eac65f609cfe", "4762b2bd27983ad916ec0a930c0eca6b", + "1631495ffae43a927267ebd476015ef1", "b0f22de7b10057e07af71f9bce4615ce", + "6fa29dc4be1a46d246a41d66a3d35cb4", "734601c2185bdf30ba9ded8b07003a05", + "524e4553d92c69e7e4ed934f7b806c6b", "3709c8950bc5fcc4a2b3ec68fc78bf7e", + "69c274d9f8e0fd6790495e9695251f1f", "ee30cc1232c27494ef53edd383568f25", + "e525dbeb0b4341952a92270dcfc51730", "b3685c9e783d3402497bbd49d28c7dd7", + "d1c9f02dc818e6b974794dfb7749aac8", "bdb9e4961f9aa8c25568d3394e968518", + "f5f74555adcad85f3ebd3cb85dc7b770", "737e2a0be806dbd701014f2078be7898", + "20a18294e3a9422193aa0a219fd80ede", "7106648ecb9ae24a54d1dbabf2a9e318", + "20f39cbd6b5ed87a6ae4f818932325c0", "a99666e3157e32a07c87b01e52091a76", + "123e4d533d478c3089c975323c85396b", "d2a8021f7683a0cdf2658418fa90a6fc", + "1437e192a3349db8702d5b90eb88dbc1", "fe097fc4aeed7cda0b0f405124efb19d", + "1988227c51fa589db1307fd890bb5972", "537e25a6c30b240dc1e3bddd1c3a0a03", + "aebe5cffaae448db5a08987a3375a428", "7127ae9bdc63df4459590dc02ca95403", + "7ad281903a210f2b1f39f7c40c8df272", "d4b97ba21f7b400ba9f9cd8bb1a576df", + "0884a824203aaf72c78f73fdaf2b23a2", "63d60388605c92daee55d517de622a9e", + "171ec49a779de1efa69510eefbd09bba", "541cf051579c5a10b9debd3bfdcb7f32", + "91c14451ad93ed88e96b5d639ce408de", "3b0313ec0e043d19744bf88c90e875a1", + "6adcb3cee91fe3a83b36deb11c5ad6dd", "c6d4bfad24616a88222681992a99d782", + "515dc0f2a41730d5c434e4f3c81b02c3", "1c69abdee3b9608a6094034badc2bec0", + "1785a0f321d7dd90aa8846961737a767", "dd12c5b8c341f2423d0d5db4f285d199", + "5741fb69aae1ca8a0fbe4f1478df88ef", "a4390ceb4e4e9f5cf6a47a9b11a97015", + "6778eb25df902092b440c3402e7f0f80", "5ad9d6b36f8898bb55e901c1c0c523da", + "73969b6c03bb5a7345a8b968b542668e", "f48192947e66d70f116193a4186d0186", + "53f60d0e89d7d994ec6d6131fb7e75ae", "c75f6f8813839ae3cf192baa29039265", + "9ff0852ebbad56663250f86ac3a3bf9b", "668938580a770ea7ace8bbf7d349e89f", + "5b06bb0a15ac465a250d9b209f05289f", "a2128f5c8692fed7e7c1c7af22ce9f72", + 
"f80f1d7a58869ec794258c0f7df14620", "ed1e03a35924c92ed2fc9808dc3f06f3", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "35ef89c35d2e8e46feb856c554c21c9f", + "b98ce33a1bf4fab840b7ef261b30dbc4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "b98ce33a1bf4fab840b7ef261b30dbc4", "42263fb359c4fdf1c7cdb4980b3e97f2", + "1e7071b7db3144188bdcf5d199fe5355", "1e7071b7db3144188bdcf5d199fe5355", + "30d367304a87bd25f0ad2ff8e4b5eb41", "4abe6dbb3198219015838dbedf07297a", + "4abe6dbb3198219015838dbedf07297a", "acec349a95b5bba98bb830372fa15e73", + "a73ad8661256ce2fdf5110425eb260b2", "a73ad8661256ce2fdf5110425eb260b2", + "8ff2f049d3f972867f14775188fe589b", "87f5f9a07aea75c325e6d7ff6c96c7c2", + "87f5f9a07aea75c325e6d7ff6c96c7c2", "325fcde7d415d7aa4929a3ea013fb9cc", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "05aa29814d5ce35389dbcf20368850da", + "fbb89f907a040e70953e3364dbe1feda", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "44ac511baf45032078cc0b45e41dba79", "efb98974adc58d88e122293436bb9184", + "7eee18c1a16bcb4e7ef7b27f68ba884f", "b0904c9b118dd9a1f9f034c0ff82d1c1", + "54436deb5183dd9669dd4f5feadb3850", "4db1c310b7d9a8bd3e2b5d20fa820e3b", + "c40abc6b2d67527f48a287cd7e157428", "48ec3fcf509805f484c8e0948c3469be", + "cb7d4a76fa7de52ed2fe889785327b38", "f57983346815fa41e969c195c1c03774", + "7dba59b0de2c877666ded6bdaefdcc30", "4837f8ba2f67f17f28a38c5e2a434c73", + "09e06fe9dc7ef7818f2a96895235afd4", "002976970ec62b360f956b9c091782d4", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "78673b1084367e97b8dd83990adc5219", + "06b5d4a30b9efb6c1d95ef7957f49e76", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "ce460146922cd53970b510d460aa4062", "6fd051938b8efcec9ece042f1edc177a", + "f5ff0dcfe3c1a56e3856549d8ded416b", "b69bc2cfc17c6b4313264db96831f0d1", + "38a5e65bd71934becfb376eb3b9bc513", "32c1163aa4ca6b6c69d950aba7b06d48", + "0c22a6c014c6347983de4ca863f3b53f", "a80c5ee9eb2dfb9a0d56e92eb3f85d91", + "a9719722a150a81175427bc161b95d7a", "8237befd456131a488cc5b8b63f4aca5", + "51616abcd0beea53a78ffce106b974fc", "6c47b22270f01d27b404da07e1be1202", + "356268298d3887edaabd0169a912c94e", "d2b00216e106cb8c5450e2eff1f8481a", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "c2de3a582c79aee811076211c497d2bc", + "d1b6d9c73da41def26dd4f85fbd1bde8", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "d8374eb7825081b89f74b05c66bccd63", "d5f7d68c10b5eaf0fba6f93ee26266e6", + "94d19cb65f29db65e6656b588f431ade", "5126e95f0249024a6f6d426714bd5b20", + "d7d3654b9c2dabe13239875984770acd", "6491afd5d651aab80aa179b579b74341", + "037a5de0de89983808f8e8f6dc39110f", "5980073b7685c5c9b2ec027e06be2cbc", + "0abb9d035aca426b62ca0f3fab063bab", "fe002a902bb4ec24dfe3ea0fe381a472", + "1ac15726df1aa2cd8855162a91893379", "0758c3ac16467605d73c725a697c3dc1", + "97d894d85f6ccfa4ff81e0e8fdf03da1", "c3c7b362f063a18244ea542a42d2873c", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7f6829458f00edb88f78851dd1a08739", + "a8bbe9b6b9eaf6f681d91c981b994949", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5", + "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769", + "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68", + "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86", + 
"3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8", + "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed", + "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "67b2ea94cc4d0b32db3ae3c29eee4d46", + "bcfec99ad75988fa1efc1733204f17f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "79c222c5796e50119f1921e7bc534a25", "ae3f7c189458f044e9c52399d37a55e2", + "fd6dde45bd2937331428de9ef4f8e869", "b384d065423f3d271b85781d76a73218", + "466ea0680c06f59e8b3bb293608731fb", "360541ba94f42d115fe687a97a457ffb", + "e5a0794d37af40c40a4d2c6d3f7d2aa2", "4eed285651a75614bd60adebbe2e185c", + "bbdbf93942282d7b9c4f07591a1764a6", "1288a9ec3e6f79213b6745e6e7568c44", + "4ff1310bfd656d69ed5c108a91a9b01a", "3380806b5f67eb3ebce42f8e7c05b256", + "09c4bdf0f30aca6812fb55a5ac06b1bd", "722c86ba6bf21f40742ee33b4edc17c4", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "f5303c96d1630f9840eaaba058cd818b", + "c20cd45782b2f52c05e4189912047570", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "d6360f96fe15a3ee1e903b0a53dcaaeb", "4b18995cdf2f5d18410d3a732c5932b1", + "6f62bf7395c3dfccc1565ba8424f20e8", "c9987ed30491cd28bbc711dd57228247", + "8e277ec837cbecf529ae2eb0578fddc1", "c0c132386f23c5f0fba055a12fb79547", + "6b5617ab78dd0916690dfa358298b7b3", "394abedca37f60d1a5148a4c975305ed", + "bb88881e0e4cf2d88c2d2b38b5833f20", "bef10806be8d58ea8e97870a813b075e", + "b4b017d1f792bea69d3b773db7c80c7c", "0660bc63041213a8a4d74724a3bc4291", + "5050c8c5388a561691fd414b00c041df", "9ed40c68de6a8008a902d7224f8b620f", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "ec10ce4a674424478a401847f744251d", + "bdd897eafc8ef2651a7bba5e523a6ac2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "2745de4a6b29abb85ee513e22ad362c3", "8aaad384b7cd349b4b968e657ec15523", + "fb6c0723432bcd2246d51a90f5fb5826", "f8104ed5921ebd48c6eed16150ffe028", + "85c2e236b3e32bf731601237cf0594cd", "8bd6eefff9640766cdf64ab082cb1485", + "78b5cd9dde6c6a5900f3040c57172091", "aaa980506bd7bb1d75924a8025698d1a", + "90050a411d501f7166f6741832b0c342", "d6ec88b2c38e32511f3359e1d5f9d85b", + "96506b8b39274c8fe687ea39761997f1", "3322ea83995c2762fb60db993b401658", + "151b6e4ce60392639982fca5a73ac3d3", "d52a1038e135bef233674a843f8c7cb6", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} + +const char* GetConvolveScaleDigest12bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. 
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = { + "aea59b7a638f27acad2b90fd2b8c9fee", "be87ba981a0af25611a7d5f0970be9b3", + "7c81f1486cd607376d776bf2c6e81dec", "f683ba2a9b353bea35f26c1ed730f3c5", + "11e2d70daff1726093cb4fcae33ce0d6", "567575eac0dea2f379019b2d4bafe444", + "216479ed580d6e0d7c1d523015394814", "dcabbe5f5709a4b6634d77cc514e863a", + "4e888207fe917faeea2b44383ac16caf", "d617c5608fae3b01c507c7e88040fee3", + "eeac5d9b3dc005e76f13dfc7483eae48", "8ff0a82811f77303c4516bb8c761336f", + "95a7c315aaa208097b6ab006f1d07654", "da63527ee80e6772435cff8321a29a95", + "404457f72e7113d1f3797a39319fd3fe", "43cbccfe2663ec11c157319acfe629a5", + "1dc5b8dee4542f3d7fcf6b0fa325dfde", "16d4506674f2fcedfcd1e006eb097141", + "4fcf329ddb405cd6bbb0a6fb87e29eb3", "de77a781957653ea1750f79995605cdc", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "436f6fdc008d94a94bc6f516f98f402f", + "b436bd9036f08ba7e50cfc536911dbbd", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "720a01018856bd83f4d89a9024b14728", "b7e01a3f161007712ce342f59b2c51f2", + "922420ebe5dec4f19c259ebdf8a3259a", "979aaba579556207a7bbcc939123c1b2", + "89a30898cbaa4d64f9072173e8365864", "0586ff961f2e4228f4e38299fb25ae07", + "adb27a03f8b1b50fe2a52b5ca8d4fc28", "4f91cd92aab2e09f4b123251a8d2f219", + "620fba0fff163d96a1cd663d1af4a4c5", "bf7a0fa65b1a90ba34c834558fa2c86e", + "c21f7d7d16d047a27b871a7bf8476e2d", "a94b17c81f3ce2b47081bd8dd762a2e5", + "9078d20f59bc24862af3856acb8c0357", "ee510ce6b3d22de9e4bd7920a26fd69a", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct ConvolveTypeParam { ConvolveTypeParam(bool is_intra_block_copy, bool is_compound, bool has_vertical_filter, bool has_horizontal_filter) @@ -447,6 +607,7 @@ template <int bitdepth, typename Pixel> class ConvolveTest : public testing::TestWithParam< std::tuple<ConvolveTypeParam, ConvolveTestParam>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); ConvolveTest() = default; ~ConvolveTest() override = default; @@ -725,14 +886,24 @@ void ConvolveTest<bitdepth, Pixel>::Test( if (!use_fixed_values) { // md5 sums are only calculated for random input. - const char* ref_digest; - if (bitdepth == 8) { - ref_digest = GetConvolveDigest8bpp(GetDigestId()); - } else { + const char* ref_digest = nullptr; + switch (bitdepth) { + case 8: + ref_digest = GetConvolveDigest8bpp(GetDigestId()); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - ref_digest = GetConvolveDigest10bpp(GetDigestId()); -#endif // LIBGAV1_MAX_BITDEPTH >= 10 + case 10: + ref_digest = GetConvolveDigest10bpp(GetDigestId()); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + ref_digest = GetConvolveDigest12bpp(GetDigestId()); + break; +#endif } + ASSERT_NE(ref_digest, nullptr); + const char* direction; if (type_param_.has_vertical_filter && type_param_.has_horizontal_filter) { direction = "2D"; @@ -896,6 +1067,7 @@ class ConvolveScaleTest : public testing::TestWithParam< std::tuple<bool /*is_compound*/, ConvolveTestParam>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); ConvolveScaleTest() = default; ~ConvolveScaleTest() override = default; @@ -1160,14 +1332,23 @@ void ConvolveScaleTest<bitdepth, Pixel>::Test( if (!use_fixed_values) { // md5 sums are only calculated for random input. 
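// [Editor's note: illustrative aside, not part of the patch.] The hunk below
// swaps the old if/else digest lookup for the bitdepth switch pattern this
// patch applies across the tests: each case sits behind the matching
// LIBGAV1_MAX_BITDEPTH guard, and the result is null-checked before use. A
// self-contained sketch of the pattern (GetDigestForBitdepth is a hypothetical
// name):
//   const char* GetDigestForBitdepth(int bitdepth, int id) {
//     const char* digest = nullptr;
//     switch (bitdepth) {
//       case 8: digest = GetConvolveDigest8bpp(id); break;
//   #if LIBGAV1_MAX_BITDEPTH >= 10
//       case 10: digest = GetConvolveDigest10bpp(id); break;
//   #endif
//   #if LIBGAV1_MAX_BITDEPTH == 12
//       case 12: digest = GetConvolveDigest12bpp(id); break;
//   #endif
//     }
//     return digest;  // Callers ASSERT_NE(digest, nullptr) before comparing.
//   }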
- const char* ref_digest; - if (bitdepth == 8) { - ref_digest = GetConvolveScaleDigest8bpp(GetDigestId()); - } else { + const char* ref_digest = nullptr; + switch (bitdepth) { + case 8: + ref_digest = GetConvolveScaleDigest8bpp(GetDigestId()); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - ref_digest = GetConvolveScaleDigest10bpp(GetDigestId()); -#endif // LIBGAV1_MAX_BITDEPTH >= 10 + case 10: + ref_digest = GetConvolveScaleDigest10bpp(GetDigestId()); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + ref_digest = GetConvolveScaleDigest12bpp(GetDigestId()); + break; +#endif } + ASSERT_NE(ref_digest, nullptr); const auto elapsed_time_us = static_cast<int>(absl::ToInt64Microseconds(elapsed_time)); @@ -1322,6 +1503,47 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest10bpp, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using ConvolveTest12bpp = ConvolveTest<12, uint16_t>; + +TEST_P(ConvolveTest12bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << 12) - 1); +} + +TEST_P(ConvolveTest12bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveTest12bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +using ConvolveScaleTest12bpp = ConvolveScaleTest<12, uint16_t>; + +TEST_P(ConvolveScaleTest12bpp, FixedValues) { + Test(true, 0); + Test(true, 1); + Test(true, 128); + Test(true, (1 << 12) - 1); +} + +TEST_P(ConvolveScaleTest12bpp, RandomValues) { Test(false, 0); } + +TEST_P(ConvolveScaleTest12bpp, DISABLED_Speed) { + const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height)); + Test(false, 0, num_runs); +} + +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12bpp, + testing::Combine(testing::ValuesIn(kConvolveTypeParam), + testing::ValuesIn(kConvolveParam))); +INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest12bpp, + testing::Combine(testing::Bool(), + testing::ValuesIn(kConvolveParam))); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc index 34d10fc..ef83235 100644 --- a/src/dsp/distance_weighted_blend.cc +++ b/src/dsp/distance_weighted_blend.cc @@ -88,7 +88,22 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend + dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -97,6 +112,9 @@ void DistanceWeightedBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc index fdf058e..88040b4 100644 --- a/src/dsp/distance_weighted_blend_test.cc +++ b/src/dsp/distance_weighted_blend_test.cc @@ -47,6 +47,7 @@ template <int bitdepth, typename Pixel> class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= 
kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); DistanceWeightedBlendTest() = default; ~DistanceWeightedBlendTest() override = default; @@ -268,6 +269,56 @@ INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDistanceWeightedBlendDigest12bpp(const BlockSize block_size) { + static const char* const kDigests[] = { + // 4xN + "e30bf8f5f294206ad1dd79bd10a20827", + "f0cfb60134562d9c5f2ec6ad106e01ef", + "ad0876244e1b769203266a9c75b74afc", + // 8xN + "5265b954479c15a80f427561c5f36ff4", + "7f157457d1671e4ecce7a0884e9e3f76", + "d2cef5cf217f2d1f787c8951b7fe7cb2", + "6d23059008adbbb84ac941c8b4968f5b", + // 16xN + "ae521a5656ed3440d1fa950c20d90a79", + "935bec0e12b5dd3e0c34b3de8ba51476", + "0334bafcdcd7ddddb673ded492bca25a", + "c5360f08d0be77c79dc19fb55a0c5fe0", + "c2d1e7a4244a8aaaac041aed0cefc148", + // 32xN + "ce7f3cf78ae4f836cf69763137f7f6a6", + "800e52ebb14d5831c047d391cd760f95", + "74aa2b412b42165f1967daf3042b4f17", + "140d4cc600944b629b1991e88a9fe97c", + // 64xN + "3d206f93229ee2cea5c5da4e0ac6445a", + "3d13028f8fffe79fd35752c0177291ca", + "e7a7669acb5979dc7b15a19eed09cd4c", + "599368f4971c203fc5fa32989fe8cb44", + // 128xN + "54b46af2e2c8d2081e26fa0315b4ffd7", + "602e769bb2104e78223e68e50e7e86a0", + }; + assert(block_size < kMaxBlockSizes); + return kDigests[block_size]; +} + +using DistanceWeightedBlendTest12bpp = DistanceWeightedBlendTest<12, uint16_t>; + +TEST_P(DistanceWeightedBlendTest12bpp, Blending) { + Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest12bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest12bpp, + testing::ValuesIn(kTestParam)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc index aac0ca0..97a064f 100644 --- a/src/dsp/dsp.cc +++ b/src/dsp/dsp.cc @@ -78,6 +78,12 @@ dsp::Dsp* GetWritableDspTable(int bitdepth) { return &dsp_10bpp; } #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: { + static dsp::Dsp dsp_12bpp; + return &dsp_12bpp; + } +#endif } return nullptr; } @@ -157,6 +163,7 @@ void DspInit() { #if LIBGAV1_MAX_BITDEPTH >= 10 ConvolveInit10bpp_NEON(); InverseTransformInit10bpp_NEON(); + LoopFilterInit10bpp_NEON(); LoopRestorationInit10bpp_NEON(); #endif // LIBGAV1_MAX_BITDEPTH >= 10 #endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc index 5c2a3aa..6d2817b 100644 --- a/src/dsp/dsp_test.cc +++ b/src/dsp/dsp_test.cc @@ -41,7 +41,9 @@ constexpr int kMaxTransform1dSize[kNumTransform1ds] = { }; void CheckTables(bool c_only) { -#if LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10, kBitdepth12}; +#elif LIBGAV1_MAX_BITDEPTH >= 10 static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10}; #else static constexpr int kBitdepths[] = {kBitdepth8}; @@ -108,7 +110,9 @@ void CheckTables(bool c_only) { const uint32_t cpu_features = GetCpuInfo(); super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0; #endif - if (c_only) super_res_coefficients_is_nonnull = false; + if (c_only || bitdepth == kBitdepth12) { + super_res_coefficients_is_nonnull = false; + } if (super_res_coefficients_is_nonnull) { EXPECT_NE(dsp->super_res_coefficients, nullptr); } else { @@ -234,6 +238,9 @@ TEST(Dsp, TablesArePopulatedCOnly) { #if LIBGAV1_MAX_BITDEPTH >= 10 
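// [Editor's note: descriptive comment, not part of the patch.] With the
// preprocessor guards here, this test resets every compiled bitdepth table --
// 8bpp always, plus 10bpp and 12bpp when LIBGAV1_MAX_BITDEPTH enables them --
// before dsp_internal::DspInit_C() repopulates them, so that
// CheckTables(/*c_only=*/true) verifies the C-only implementations fully
// populate every table.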
test_utils::ResetDspTable(kBitdepth10); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + test_utils::ResetDspTable(kBitdepth12); +#endif dsp_internal::DspInit_C(); CheckTables(/*c_only=*/true); } @@ -241,15 +248,22 @@ TEST(Dsp, TablesArePopulatedCOnly) { TEST(Dsp, GetDspTable) { EXPECT_EQ(GetDspTable(1), nullptr); - EXPECT_NE(GetDspTable(8), nullptr); + EXPECT_NE(GetDspTable(kBitdepth8), nullptr); EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr); - EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth8), nullptr); #if LIBGAV1_MAX_BITDEPTH >= 10 - EXPECT_NE(GetDspTable(10), nullptr); - EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr); + EXPECT_NE(GetDspTable(kBitdepth10), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr); +#else + EXPECT_EQ(GetDspTable(kBitdepth10), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr); +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + EXPECT_NE(GetDspTable(kBitdepth12), nullptr); + EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr); #else - EXPECT_EQ(GetDspTable(10), nullptr); - EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr); + EXPECT_EQ(GetDspTable(kBitdepth12), nullptr); + EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr); #endif } diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc index fa12b69..906230d 100644 --- a/src/dsp/film_grain.cc +++ b/src/dsp/film_grain.cc @@ -19,17 +19,16 @@ #include <cstddef> #include <cstdint> #include <cstring> -#include <new> -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" #include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" #include "src/utils/memory.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -45,7 +44,7 @@ void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[], memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length); return; } - constexpr int index_shift = bitdepth - kBitdepth8; + constexpr int index_shift = (bitdepth == kBitdepth10) ? 2 : 0; static_assert(sizeof(scaling_lut[0]) == 2, ""); Memset(scaling_lut, point_scaling[0], std::max(static_cast<int>(point_value[0]), 1) << index_shift); @@ -866,6 +865,121 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + + // ChromaAutoRegressionFunc + // Chroma autoregression should never be called when lag is 0 and use_luma is + // false. 
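// [Editor's note: illustrative aside, not part of the patch.] The
// chroma_auto_regression table populated below is indexed as [use_luma][lag],
// and [0][0] stays nullptr because lag 0 without luma input applies no filter.
// A rough dispatch sketch, assuming the caller derives use_luma from the film
// grain parameters (e.g. num_y_points > 0):
//   const int lag = params.auto_regression_coeff_lag;
//   const int use_luma = static_cast<int>(params.num_y_points > 0);
//   if (use_luma != 0 || lag > 0) {
//     dsp.film_grain.chroma_auto_regression[use_luma][lag](/*...*/);
//   }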
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>; + + // ConstructNoiseStripesFunc + dsp->film_grain.construct_noise_stripes[0] = + ConstructNoiseStripes_C<kBitdepth12, int16_t>; + dsp->film_grain.construct_noise_stripes[1] = + ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>; + + // ConstructNoiseImageOverlapFunc + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>; + + // InitializeScalingLutFunc + dsp->film_grain.initialize_scaling_lut = + InitializeScalingLookupTable_C<kBitdepth12>; + + // BlendNoiseWithImageLumaFunc + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>; + + // BlendNoiseWithImageChromaFunc + dsp->film_grain.blend_noise_chroma[0] = + BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma + // Chroma autoregression should never be called when lag is 0 and use_luma is + // false. 
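// [Editor's note: descriptive comment, not part of the patch.] The
// #ifndef LIBGAV1_Dsp12bpp_* guards wrapping these assignments follow the
// convention noted in the convolve.h hunk above: headers pull in LIBGAV1_DspXXX
// defines recording which functions already have specialized implementations,
// and the C fallback is installed only for slots no specialization has claimed
// at compile time.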
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes + dsp->film_grain.construct_noise_stripes[0] = + ConstructNoiseStripes_C<kBitdepth12, int16_t>; + dsp->film_grain.construct_noise_stripes[1] = + ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc + dsp->film_grain.initialize_scaling_lut = + InitializeScalingLookupTable_C<kBitdepth12>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma + dsp->film_grain.blend_noise_chroma[0] = + BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace film_grain @@ -874,6 +988,9 @@ void FilmGrainInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 film_grain::Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + film_grain::Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h index 2e6ad45..3c8d761 100644 --- a/src/dsp/film_grain_common.h +++ b/src/dsp/film_grain_common.h @@ -17,15 +17,7 @@ #ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_ #define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_ -#include <cstddef> #include <cstdint> -#include <memory> -#include <type_traits> - -#include "src/dsp/common.h" -#include "src/utils/array_2d.h" -#include "src/utils/constants.h" -#include "src/utils/cpu.h" namespace libgav1 { diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc index fe66db2..9875ef1 100644 --- a/src/dsp/intra_edge.cc +++ b/src/dsp/intra_edge.cc @@ -100,7 +100,26 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef 
LIBGAV1_Dsp12bpp_IntraEdgeFilter + dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler + dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -109,6 +128,9 @@ void IntraEdgeInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc index aca6f9e..b287544 100644 --- a/src/dsp/intra_edge_test.cc +++ b/src/dsp/intra_edge_test.cc @@ -76,6 +76,7 @@ constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = { template <int bitdepth, typename Pixel> class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); IntraEdgeFilterTest() = default; IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete; IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete; @@ -315,11 +316,27 @@ TEST_P(IntraEdgeFilterTest10bpp, FixedInput) { } TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(1e7); } -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +using IntraEdgeFilterTest12bpp = IntraEdgeFilterTest<12, uint16_t>; + +const char* GetIntraEdgeFilterDigest12bpp(int strength, int size) { + return GetIntraEdgeFilterDigest10bpp(strength, size); +} + +TEST_P(IntraEdgeFilterTest12bpp, FixedInput) { + TestFixedValues(GetIntraEdgeFilterDigest12bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest12bpp, DISABLED_Speed) { TestRandomValues(1e7); } +#endif // LIBGAV1_MAX_BITDEPTH == 12 template <int bitdepth, typename Pixel> class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); IntraEdgeUpsamplerTest() = default; IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete; IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete; @@ -476,7 +493,22 @@ TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) { } TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); } -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +using IntraEdgeUpsamplerTest12bpp = IntraEdgeUpsamplerTest<12, uint16_t>; + +const char* GetIntraEdgeUpsampleDigest12bpp(int size) { + return GetIntraEdgeUpsampleDigest10bpp(size); +} + +TEST_P(IntraEdgeUpsamplerTest12bpp, FixedInput) { + TestFixedValues(GetIntraEdgeUpsampleDigest12bpp(size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeUpsamplerTest12bpp, DISABLED_Speed) { TestRandomValues(5e7); } +#endif // LIBGAV1_MAX_BITDEPTH == 12 INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp, testing::ValuesIn(kIntraEdgeFilterParamList)); @@ -512,7 +544,15 @@ INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp, testing::ValuesIn(kIntraEdgeUpsampleSizes)); #endif -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest12bpp, + testing::ValuesIn(kIntraEdgeFilterParamList)); +INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest12bpp, + testing::ValuesIn(kIntraEdgeUpsampleSizes)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc index 75af279..3162acc 100644 --- 
a/src/dsp/intrapred.cc +++ b/src/dsp/intrapred.cc @@ -1422,6 +1422,551 @@ void Init10bpp() { } // NOLINT(readability/fn_size) #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using Defs12bpp = IntraPredBppDefs<12, uint16_t>; + +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = + Defs12bpp::_4x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DefsHbd::_4x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DefsHbd::_4x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DefsHbd::_4x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + DefsHbd::_4x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + DefsHbd::_4x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + DefsHbd::_4x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = + Defs12bpp::_4x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DefsHbd::_4x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DefsHbd::_4x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DefsHbd::_4x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + DefsHbd::_4x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + DefsHbd::_4x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + DefsHbd::_4x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = + Defs12bpp::_4x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DefsHbd::_4x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DefsHbd::_4x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DefsHbd::_4x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical + 
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + DefsHbd::_4x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DefsHbd::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + DefsHbd::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs12bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DefsHbd::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DefsHbd::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DefsHbd::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + DefsHbd::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DefsHbd::_8x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + DefsHbd::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs12bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DefsHbd::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DefsHbd::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DefsHbd::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + DefsHbd::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DefsHbd::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + DefsHbd::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs12bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DefsHbd::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DefsHbd::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DefsHbd::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical + 
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + DefsHbd::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DefsHbd::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + DefsHbd::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs12bpp::_8x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DefsHbd::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DefsHbd::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DefsHbd::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + DefsHbd::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DefsHbd::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + DefsHbd::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs12bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DefsHbd::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DefsHbd::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DefsHbd::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + DefsHbd::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DefsHbd::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + DefsHbd::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs12bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DefsHbd::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DefsHbd::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DefsHbd::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical + 
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + DefsHbd::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DefsHbd::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + DefsHbd::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs12bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DefsHbd::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DefsHbd::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DefsHbd::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + DefsHbd::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DefsHbd::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + DefsHbd::_16x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs12bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DefsHbd::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DefsHbd::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DefsHbd::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + DefsHbd::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DefsHbd::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + DefsHbd::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs12bpp::_16x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DefsHbd::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DefsHbd::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DefsHbd::_16x64::Dc; +#endif +#ifndef 
LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+      Defs12bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+      Defs12bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+      Defs12bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+      Defs12bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+      Defs12bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+      Defs12bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+      Defs12bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      DefsHbd::_64x64::Paeth;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_INTRAPREDICTORS_WxH
 #undef INIT_INTRAPREDICTORS
 }  // namespace
@@ -1431,6 +1976,9 @@ void IntraPredInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
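The Init12bpp() functions added throughout this commit all follow the same shape: fetch the writable per-bitdepth dispatch table, then install the plain-C fallback for every entry whose LIBGAV1_Dsp12bpp_* guard macro is not defined (an optimized backend that implements a slot defines the matching macro, which keeps the C version out of that slot). A minimal self-contained sketch of that pattern, using hypothetical names rather than the libgav1 API:

#include <cassert>
#include <cstddef>

using PredictorFn = void (*)(void* dest, std::ptrdiff_t stride);

struct DspTable {
  PredictorFn dc = nullptr;
  PredictorFn paeth = nullptr;
};

static void Dc_C(void* /*dest*/, std::ptrdiff_t /*stride*/) { /* C fallback */ }
static void Paeth_C(void* /*dest*/, std::ptrdiff_t /*stride*/) { /* C fallback */ }

// One table per bitdepth; only the 12bpp one is shown here.
static DspTable* GetWritableTable12() {
  static DspTable table;
  return &table;
}

void Init12bpp_Sketch() {
  DspTable* const dsp = GetWritableTable12();
  assert(dsp != nullptr);
#ifndef MYLIB_Dsp12bpp_Dc  // a SIMD unit that implements Dc would define this
  dsp->dc = Dc_C;
#endif
#ifndef MYLIB_Dsp12bpp_Paeth
  dsp->paeth = Paeth_C;
#endif
}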
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
index 0f7f4f2..798bb73 100644
--- a/src/dsp/intrapred_cfl.cc
+++ b/src/dsp/intrapred_cfl.cc
@@ -639,6 +639,263 @@ void Init10bpp() {
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(12, uint16_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_CFL_INTRAPREDICTOR_WxH
 #undef INIT_CFL_INTRAPREDICTORS
@@ -649,6 +906,9 @@ void IntraPredCflInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
index 82f1d2f..8415d51 100644
--- a/src/dsp/intrapred_cfl_test.cc
+++ b/src/dsp/intrapred_cfl_test.cc
@@ -49,6 +49,7 @@ template <int bitdepth, typename Pixel>
 class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
                           public test_utils::MaxAlignedAllocable {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   IntraPredTestBase() {
     switch (tx_size_) {
       case kNumTransformSizes:
@@ -127,6 +128,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
 template <int bitdepth, typename Pixel>
 class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   CflIntraPredTest() = default;
   CflIntraPredTest(const CflIntraPredTest&) = delete;
   CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
@@ -274,6 +276,7 @@ void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
 template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
 class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   CflSubsamplerTest() = default;
   CflSubsamplerTest(const CflSubsamplerTest&) = delete;
   CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
@@ -654,8 +657,6 @@ TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
 //------------------------------------------------------------------------------
 #if LIBGAV1_MAX_BITDEPTH >= 10
-//------------------------------------------------------------------------------
-
 using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
 
 const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
@@ -853,9 +854,238 @@ TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
 TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
 TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
-
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
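The 32-character hexadecimal constants that follow are golden digests: each FixedInput test runs the C predictor once over a fixed pseudo-random block and compares a hash of the output buffer against the stored string, so any behavioral change surfaces as a digest mismatch. A rough sketch of that flow; the hash below is a stand-in FNV-1a, not the MD5-style digest the real test utilities compute:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in hash just to illustrate the flow (not the libgav1 test API).
std::string HashHex(const std::vector<uint16_t>& block) {
  uint64_t h = 1469598103934665603ull;  // FNV offset basis
  for (uint16_t v : block) {
    h = (h ^ v) * 1099511628211ull;  // FNV prime
  }
  char buf[17];
  std::snprintf(buf, sizeof(buf), "%016llx",
                static_cast<unsigned long long>(h));
  return buf;
}

// FixedInput-style check: hash the predictor's output block and compare it
// with the per-size golden constant.
bool MatchesGolden(const std::vector<uint16_t>& output,
                   const std::string& golden) {
  return HashHex(output) == golden;
}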
"4c34f5669880ab78d648b16b68ea0c24"; + static const char* const kDigest16x32 = "5d85daf690020ed235617870a1a179b1"; + static const char* const kDigest32x8 = "f8eec12e58c469ffb698fc60b13b927c"; + static const char* const kDigest32x16 = "f272bb7e5d2df333aa63d806c95e6748"; + static const char* const kDigest32x32 = "c737987c0a5414b03e6014f145dd999c"; + + switch (tx_size) { + case kTransformSize4x4: + return kDigest4x4; + case kTransformSize4x8: + return kDigest4x8; + case kTransformSize4x16: + return kDigest4x16; + case kTransformSize8x4: + return kDigest8x4; + case kTransformSize8x8: + return kDigest8x8; + case kTransformSize8x16: + return kDigest8x16; + case kTransformSize8x32: + return kDigest8x32; + case kTransformSize16x4: + return kDigest16x4; + case kTransformSize16x8: + return kDigest16x8; + case kTransformSize16x16: + return kDigest16x16; + case kTransformSize16x32: + return kDigest16x32; + case kTransformSize32x8: + return kDigest32x8; + case kTransformSize32x16: + return kDigest32x16; + case kTransformSize32x32: + return kDigest32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflIntraPredTest12bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), num_runs); +} + +TEST_P(CflIntraPredTest12bpp, FixedInput) { + TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), 1); +} + +TEST_P(CflIntraPredTest12bpp, Overflow) { TestSaturatedValues(); } + +TEST_P(CflIntraPredTest12bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +using CflSubsamplerTest12bpp444 = + CflSubsamplerTest<12, uint16_t, kSubsamplingType444>; +using CflSubsamplerTest12bpp422 = + CflSubsamplerTest<12, uint16_t, kSubsamplingType422>; +using CflSubsamplerTest12bpp420 = + CflSubsamplerTest<12, uint16_t, kSubsamplingType420>; + +const char* GetCflSubsamplerDigest12bpp(TransformSize tx_size, + SubsamplingType subsampling_type) { + static const char* const kDigests4x4[3] = { + "44af37c60e9ccaacea004b57d5dea4cf", + "e29dd1d93f23b23778ed8cd85910d987", + "81e5dac2fd4c90f872ab814ed0f76ae5", + }; + static const char* const kDigests4x8[3] = { + "bfc04aed9fe41ec07b0462a219652d16", + "693dd064636a0aa3be7aa098e867c512", + "0636c25d88aacd85d63e56011e7c5d15", + }; + static const char* const kDigests4x16[3] = { + "6479ab30377288e75a78068d47c7e194", + "7d6f9b8b3eb85e73626118fc9210e622", + "1f3d474cd7c86899da90e515b8b7a906", + }; + static const char* const kDigests8x4[3] = { + "7da5a2029bcdab159225c475fdff02da", + "096bfef24caa0670d2cd7b0bb63a7ba6", + "f749310dfc8a6129ed438dbc845470c0", + }; + static const char* const kDigests8x8[3] = { + "08494051a7ff50718313a79ec7c51f92", + "637efad0630e253f7cce11af1a0af456", + "b220faf7dfedef860d59079dcf201757", + }; + static const char* const kDigests8x16[3] = { + "19f027af516e88d3b9e613e578deb126", + "4f3bb155d70f9ea76d05b2f41b297a0c", + "b7504347eeda1e59ba8e36385c219e40", + }; + static const char* const kDigests8x32[3] = { + "b8f1ef01c5672c87ee1004bb3cd7b8bc", + "b3e3318b050eb1c165d1e320ef622fa7", + "67754f7c5ae84dc23bb76ffaa2fa848e", + }; + static const char* const kDigests16x4[3] = { + "f687fb4e22d8a1446eeb4915036874f4", + "7b5ef3d393a98dfe0ba49a0db2083465", + "840bbb6edaa50e9f7d391033a3dda2d9", + }; + static const char* const kDigests16x8[3] = { + "dd9aed11d115a028035f0cee5b90d433", + "340d5d0784356ea199d3d751f4d6ed5e", + "e55f6fb5f34d829727e9dc2068098933", + }; + static 
const char* const kDigests16x16[3] = { + "1df36a20d76a405c6273b88b38693cf9", + "2a7590d01df60b4bc6f10bfdb07b7a65", + "510ee31a5bd609e8f4542bb817539668", + }; + static const char* const kDigests16x32[3] = { + "bdbc13b9fb7c3c50d25fda57f86f5ad9", + "7c138c568794b3d0c8aabff2edc07efd", + "581bef267c2a66e4c2fb079968440dbe", + }; + static const char* const kDigests32x8[3] = { + "26f62743793811475e2afe1414c5fee1", + "6e6bf1678a04f2f727f0679564fb3630", + "a4c15562c26dbcfa43fe03a2b6e728b5", + }; + static const char* const kDigests32x16[3] = { + "791f0713bbf032081da8ec08e58b9cd3", + "5dc7a673e92767186ae86996f4a30691", + "651f09d1244c817d92d1baa094c86f56", + }; + static const char* const kDigests32x32[3] = { + "543a9d76e7238d88ba86218ec47c1f49", + "b0f2b29aae4858c1f09c27fc4344fd15", + "1d45083875fed14c4e5f149384a3cd2d", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4[subsampling_type]; + case kTransformSize4x8: + return kDigests4x8[subsampling_type]; + case kTransformSize4x16: + return kDigests4x16[subsampling_type]; + case kTransformSize8x4: + return kDigests8x4[subsampling_type]; + case kTransformSize8x8: + return kDigests8x8[subsampling_type]; + case kTransformSize8x16: + return kDigests8x16[subsampling_type]; + case kTransformSize8x32: + return kDigests8x32[subsampling_type]; + case kTransformSize16x4: + return kDigests16x4[subsampling_type]; + case kTransformSize16x8: + return kDigests16x8[subsampling_type]; + case kTransformSize16x16: + return kDigests16x16[subsampling_type]; + case kTransformSize16x32: + return kDigests16x32[subsampling_type]; + case kTransformSize32x8: + return kDigests32x8[subsampling_type]; + case kTransformSize32x16: + return kDigests32x16[subsampling_type]; + case kTransformSize32x32: + return kDigests32x32[subsampling_type]; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(CflSubsamplerTest12bpp444, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest12bpp444, FixedInput) { + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest12bpp444, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest12bpp444, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest12bpp422, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest12bpp422, FixedInput) { + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest12bpp422, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest12bpp422, Random) { TestRandomValues(); } + +TEST_P(CflSubsamplerTest12bpp420, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs); +} + +TEST_P(CflSubsamplerTest12bpp420, FixedInput) { + TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1); +} + +TEST_P(CflSubsamplerTest12bpp420, Overflow) { TestSaturatedValues(); } + +TEST_P(CflSubsamplerTest12bpp420, Random) { TestRandomValues(); } +#endif // LIBGAV1_MAX_BITDEPTH == 12 + // Cfl predictors are available only for transform sizes with // max(width, height) <= 32. 
 constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
@@ -918,6 +1148,17 @@ INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 }  // namespace dsp
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
index 21a40b5..9146074 100644
--- a/src/dsp/intrapred_directional.cc
+++ b/src/dsp/intrapred_directional.cc
@@ -94,11 +94,19 @@ void DirectionalIntraPredictorZone1_C(
   } while (++y < height);
 }
 
+// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT.
+// https://github.com/llvm/llvm-project/issues/54427
+#if defined(__clang__) && __clang_major__ == 14
+#define LOCAL_RESTRICT
+#else
+#define LOCAL_RESTRICT LIBGAV1_RESTRICT
+#endif
+
 template <typename Pixel>
 void DirectionalIntraPredictorZone2_C(
-    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
-    const void* LIBGAV1_RESTRICT const top_row,
-    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    void* LOCAL_RESTRICT const dest, ptrdiff_t stride,
+    const void* LOCAL_RESTRICT const top_row,
+    const void* LOCAL_RESTRICT const left_column, const int width,
     const int height, const int xstep, const int ystep,
     const bool upsampled_top, const bool upsampled_left) {
   const auto* const top = static_cast<const Pixel*>(top_row);
@@ -143,6 +151,8 @@ void DirectionalIntraPredictorZone2_C(
   } while (++y < height);
 }
 
+#undef LOCAL_RESTRICT
+
 template <typename Pixel>
 void DirectionalIntraPredictorZone3_C(
     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
@@ -236,6 +246,34 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void IntraPredDirectionalInit_C() {
@@ -243,6 +281,9 @@ void IntraPredDirectionalInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
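The intrapred_directional.cc hunk above sidesteps a clang 14 miscompilation (llvm-project issue 54427, cited in the diff) by redefining the restrict qualifier away on that one compiler version only. A minimal sketch of the same compiler-gated pattern, assuming a LIBGAV1_RESTRICT-style macro that normally wraps __restrict (the macro names here are illustrative):

// Drop the aliasing hint only where the compiler is known to mishandle it.
#if defined(__clang__) && __clang_major__ == 14
#define MY_RESTRICT  // clang 14: expand to nothing, losing only optimization
#else
#define MY_RESTRICT __restrict
#endif

void Copy(int* MY_RESTRICT dst, const int* MY_RESTRICT src, int n) {
  for (int i = 0; i < n; ++i) dst[i] = src[i];
}

#undef MY_RESTRICT

Since restrict is purely an optimization hint, disabling it keeps the code correct everywhere at a small performance cost on the affected compiler.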
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
index 9e98242..8d4fa63 100644
--- a/src/dsp/intrapred_directional_test.cc
+++ b/src/dsp/intrapred_directional_test.cc
@@ -60,6 +60,7 @@ template <int bitdepth, typename Pixel>
 class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
                           public test_utils::MaxAlignedAllocable {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   IntraPredTestBase() {
     switch (tx_size_) {
       case kNumTransformSizes:
@@ -150,6 +151,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
 template <int bitdepth, typename Pixel>
 class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   DirectionalIntraPredTest() = default;
   DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
   DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
@@ -716,7 +718,7 @@ const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
 TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
 #if LIBGAV1_ENABLE_NEON
-  const auto num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+  const auto num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
 #else
   const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
 #endif
@@ -737,8 +739,8 @@ TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
 TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
 
 //------------------------------------------------------------------------------
-#if LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH >= 10
 using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
 
 const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
@@ -885,7 +887,7 @@ const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
 TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
 #if LIBGAV1_ENABLE_NEON
-  const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+  const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
 #else
   const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
 #endif
@@ -904,9 +906,178 @@ TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
 TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
 TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); }
-
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DirectionalIntraPredTest12bpp = DirectionalIntraPredTest<12, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "78f3297743f75e928e755b6ffa2d3050",
+      "7315da39861c6e3ef2e47c913e3be349",
+      "5609cb40b575f24d05880df202a60bd3",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "efb2363d3c25427abe198806c8ba4d6b",
+      "b5aaa41665a10e7e7944fb7fc90fd59a",
+      "5a85610342339ca3109d775fa18dc25c",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "9045679914980ea1f579d84509397b6e",
+      "f9f50bdc9f81a93095fd9d6998174aa7",
+      "46c1f82e85b8ba5b03bab41a2f561483",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "a0ae0956b2b667c528b7803d733d49da",
+      "5d9f60ef8904c4faedb6cfc19e54418a",
+      "4ffdcbbbcb23bca8286f1c286b9cb3e8",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "086116c6b116613b8b47a086726566ea",
+      "141dca7fcae0e4d4b88887a618271ea1",
+      "3575a34278aa0fb1eed934290982f4a7",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "7922f40216c78a40abaf675667e79493",
+      "55d20588240171df2e24d105ee1563ad",
+      "674b4d8f4dbf514d22e21cc4baeda1d3",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "32d4d7e256d3b304026ddb5430cf6a09",
+      "72f4be2569f4e067c252d51ff4030de3",
+      "6779a132e1bac0ac43c2373f56553ed8",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "1be2e0efc1403f9e22cfb8aeb28763d9",
+      "558c8a5418ac91d21a5839c454a9391f",
+      "7693ebef9b86416ebd6e78e98fcafba7",
+  };
+  static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+      "e6217ed1c673ae42e84f8757316b580d",
+      "028aa582c11a9733f0cd693211a067c5",
+      "082de9fc7c4bc80a8ec8522b5a5cb52c",
+  };
+  static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+      "e3b293c09bdc9c5c543ad046a3f0d64f",
+      "2de5803a6ed497c1039c8e6d675c1dd3",
+      "05742f807560f5d5206e54b70097dc4a",
+  };
+  static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+      "57f2ca4ba56be253eff7e6b73df5003d",
+      "ef8bea00437e01fb798a22cda59f0191",
+      "989ff38c96600c2f108d6e6fa381fd13",
+  };
+  static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+      "f5540f4874c02aa2222a3ba75106f841",
+      "17e5d20f798a96c39abc8a81e7aa7bc6",
+      "0fe9ea14c9dcae466b4a36f1c7db6978",
+  };
+  static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+      "aff9429951ab1885c0d9ed29aa1b6a9f",
+      "4b686e2a879bf0b4aadd06b412e0eb48",
+      "39325d71cddc272bfa1dd2dc80d09ffe",
+  };
+  static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+      "b83dffdf8bad2b7c3808925b6138ca1e",
+      "3656b58c7aaf2025979b4a3ed8a2841e",
+      "cfcc0c6ae3fa5e7d45dec581479459f6",
+  };
+  static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+      "3c91b3b9e2df73ffb718e0bf53c5a5c2",
+      "0dbe27603e111158e70d99e181befb83",
+      "edecbffb32ae1e49b66b6e55ad0af6c6",
+  };
+  static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+      "a3290917f755c7ccdc7b77eb3c6c89a7",
+      "42f89db41fbb366ddb78ef79a043f3e3",
+      "7f7bcbe33aa003b166677c68d12490e9",
+  };
+  static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+      "d4f4c6b70a82695f843e9227bd7d9cc8",
+      "550a0bd87936801651d552e229b683e9",
+      "a4c730ad71f566a930c5672e1b2f48f1",
+  };
+  static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+      "2087c9264c4c5fea9a6fe20dcedbe2b9",
+      "d4dd51d9578a3fc2eb75086fba867c22",
+      "6121a67d63e40107e780d0938aeb3d21",
+  };
+  static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+      "09c3818a07bc54467634c2bfce66f58f",
+      "8da453b8d72d73d71ba15a14ddd59db4",
+      "9bc939aa54445722469b120b8a505cb3",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+              static_cast<Zone>(i), num_runs);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, FixedInput) {
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+              static_cast<Zone>(i), 1);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 constexpr TransformSize kTransformSizes[] = {
     kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
     kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
@@ -938,9 +1109,13 @@ INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp,
 INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp,
                          testing::ValuesIn(kTransformSizes));
 #endif  // LIBGAV1_ENABLE_NEON
-
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 }  // namespace dsp
diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc
index 9a45eff..2d183cf 100644
--- a/src/dsp/intrapred_filter.cc
+++ b/src/dsp/intrapred_filter.cc
@@ -131,6 +131,21 @@ void Init10bpp() {
 }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 
 void IntraPredFilterInit_C() {
@@ -138,6 +153,9 @@ void IntraPredFilterInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc
index fe1efdc..c8d60a0 100644
--- a/src/dsp/intrapred_filter_test.cc
+++ b/src/dsp/intrapred_filter_test.cc
@@ -52,6 +52,7 @@ template <int bitdepth, typename Pixel>
 class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
                           public test_utils::MaxAlignedAllocable {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   IntraPredTestBase() {
     switch (tx_size_) {
       case kNumTransformSizes:
@@ -130,6 +131,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
 template <int bitdepth, typename Pixel>
 class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   FilterIntraPredTest() = default;
   FilterIntraPredTest(const FilterIntraPredTest&) = delete;
   FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete;
@@ -519,6 +521,132 @@ TEST_P(FilterIntraPredTest10bpp, FixedInput) {
 TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilterIntraPredTest12bpp = FilterIntraPredTest<12, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+      "27682e2763f742e0c7156a263af54fe1", "f6fe9b73d8a2024b3125d25a42028be3",
+      "8a232b8caa41f8c4f0b547f0aa072fd7", "411b24dc872e91de3a607f18b51c4e34",
+      "9a106b70ca2df5317afc90aba0316a98",
+  };
+  static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+      "a0d3f3a8f498727af0844a6df90da971", "bb02998e3d5d7b4643db616a5ce75c51",
+      "eaa39425427c155dea1836c37fc14f7e", "747cc4fa0c9e3418f4a15ded9f846599",
+      "c1a2aeaa01dd3edac4c26f74e01d8d57",
+  };
+  static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+      "80c01fdef14e3db28987e323801c998e", "de5a2f59384a096324eebe843d4b8ba5",
+      "f85e18efc9297793392607cdd84d8bc4", "d84bf2d9d4996c2f7fd82b6bbd52577b",
+      "9d73771de09c17bd494f1f5f75ab1111",
+  };
+  static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+      "7df2b038c4d816eb4949de6b933f0632", "0f1c45dd6e8d5534de0c9a279087ea8b",
+      "1b79f3b10facd9ffc404cbafdd73aa43", "e19adec4f14d72c5157f9faf7fc9b23e",
+      "a30ed988ea6ed797d4bf0945ffe7e330",
+  };
+  static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+      "097a0c14d89ece69e779fa755a2b75c0", "ebadfc559b20246dcd8d74413ff4d088",
+      "097c91bedc1e703b3eb54361d94df59a", "765bbad37b91e644292beac5f06811be",
+      "f3c809461fa3325f0d33087ca79c47d0",
+  };
+  static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+      "36464af48b38005b61f7f528a0b0c8ba", "47fa0868224c71d28d3cdcf247282c13",
+      "ca34bb57a37ee3e5428814ec63f52117", "420bdca6b643f4421d465345cc264167",
+      "339c124c07a611a65952dc9996ba6e12",
+  };
+  static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+      "99ca0d3b3fbdd4661a2c07bdb2752a70", "6fedae1dbfe721210b65e08dc77847dd",
+      "956810089f81dc9334103111afec2fbb", "ede4f0bee06def6d8a2037939415d845",
+      "ca146dfe0edbdac3066a0ca387fb6277",
+  };
+  static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+      "b0f7d5dbf7f9aa3f0ab13273de80dc9d", "a3537f2b60426e9f83aeef973161fcfd",
+      "d4f868f793ab232bee17b49afcfc28a0", "fc43429761d10723b5f377eb6513e59a",
+      "f59aabb06574ce24e1d1113753edb098",
+  };
+  static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+      "0b539f1e2ecf0300bf3838ab1d80952c", "44f01a4324cda8d27ea44a8bd3620526",
+      "a57819a22b422e7da9d85f09504a2c57", "dbff6a417a8f3606575acb3c98efe091",
+      "534e8e8cd4b73cb4f6ec22f903727efa",
+  };
+  static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+      "247192bd6a5c2821b8694e4669361103", "1935044a6220ac6315a58b402465b6da",
+      "bdce29a3e988b804d429da1446a34c2a", "4697132c20395fabac2662cb8b1ce35a",
+      "3d07a7beaff6925175fcd9a8e69542e6",
+  };
+  static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+      "3429b83b7ba723bdd2e3e368979b51b0", "cd099d0eb7f4a20547f91d9402e3394a",
+      "a6a7cc4e0f8ed34424264107b3657fb8", "0125ace62bec7c7ff7240bf5b6f689c5",
+      "a0722dba921b078a6d569ecb81777bf8",
+  };
+  static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+      "44b1b086ee37a93406e5db95dca825d7", "fdeed5c4644dc288f6dcc148e8d2867a",
+      "b241d112f6fa7a24c44706fb76e49132", "a782dcf01a16231276dbd20121bad640",
+      "4da9c0efd0bcb31f911af52779317fb9",
+  };
+  static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+      "bf9704995a0a868c45280cac3415c0a7", "373626072ade7c8d709ab732149fd3ae",
+      "9e4a2062aa86ac8dc5164002c953c7ca", "62eede30996d0e55afcf513fe9ad3c58",
+      "a5f3bb32688d5189341304d12e4e6449",
+  };
+  static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+      "bd93c4ddbe0f06e3f12be25ce490f68c", "bfe772b203b83c982f35a8ed0682cd16",
+      "d357ae05ce215f4c5af650ae82909081", "bd640d3c511edaac1753b64c81afb75d",
+      "4d05d67e02a7c4af7ae981b0eb8a4d7b",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(FilterIntraPredTest12bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.5e8 / (block_width_ * block_height_));
+  TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest12bpp, FixedInput) {
+  TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 // Filter-intra and Cfl predictors are available only for transform sizes
 // with max(width, height) <= 32.
 constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
@@ -549,6 +677,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest10bpp,
 #endif  // LIBGAV1_ENABLE_NEON
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest12bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 }  // namespace
 }  // namespace dsp
diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc
index 0c7f272..16b8274 100644
--- a/src/dsp/intrapred_smooth.cc
+++ b/src/dsp/intrapred_smooth.cc
@@ -714,6 +714,266 @@ void Init10bpp() {
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init12bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(DefsHbd);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH == 12
+
 #undef INIT_SMOOTH_WxH
 #undef INIT_SMOOTH
 }  // namespace
@@ -723,6 +983,9 @@ void IntraPredSmoothInit_C() {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   Init10bpp();
 #endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+  Init12bpp();
+#endif
 }
 
 }  // namespace dsp
diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h
index 6802003..06454af 100644
--- a/src/dsp/intrapred_smooth.h
+++ b/src/dsp/intrapred_smooth.h
@@ -38,6 +38,12 @@ namespace libgav1 {
 namespace dsp {
 
+enum {
+  // Weights are quadratic from '1' to '1 / block_size', scaled by
+  // 2^kSmoothWeightScale.
+  kSmoothWeightScale = 8,
+};
+
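kSmoothWeightScale = 8 in the new enum means the smooth-predictor weights are fixed-point values in which 256 represents 1.0, so a weight w and its complement 256 - w can blend two pixels with pure integer arithmetic. An illustrative blend under that convention; this shows only the fixed-point arithmetic, not the actual libgav1 smooth predictors, which combine top/left neighbors using per-position table weights:

#include <cstdint>

constexpr int kSmoothWeightScale = 8;  // weights scaled by 2^8, so 256 == 1.0

inline uint16_t Blend(uint16_t a, uint16_t b, int w) {
  const int round = 1 << (kSmoothWeightScale - 1);  // +128 for rounding
  return static_cast<uint16_t>(
      (w * a + ((1 << kSmoothWeightScale) - w) * b + round) >>
      kSmoothWeightScale);
}

// Example with 12bpp values: Blend(4095, 0, 64) == 1024, i.e. a quarter of
// the way from b to a, since 64/256 == 0.25.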
 // Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
 // This function is not thread-safe.
 void IntraPredSmoothInit_C();
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
index 335aa2f..cca1c73 100644
--- a/src/dsp/intrapred_test.cc
+++ b/src/dsp/intrapred_test.cc
@@ -47,6 +47,7 @@ template <int bitdepth, typename Pixel>
 class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
                           public test_utils::MaxAlignedAllocable {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
   IntraPredTestBase() {
     switch (tx_size_) {
       case kNumTransformSizes:
@@ -125,6 +126,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
 template <int bitdepth, typename Pixel>
 class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
  public:
+  static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "")
;
   IntraPredTest() = default;
   IntraPredTest(const IntraPredTest&) = delete;
   IntraPredTest& operator=(const IntraPredTest&) = delete;
@@ -666,6 +668,203 @@ TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
 TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraPredTest12bpp = IntraPredTest<12, uint16_t>;
+
+const char* const* GetIntraPredDigests12bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumIntraPredictors] = {
+      "f7008e0f65bdeed97375ae5e98e3309b", "a34cc5d9d1ef875df4ee2ce010d0a80a",
+      "74f615beeb217ad317ced813851be36a", "b3312e86313805b061c66a08e09de653",
+      "2db47240c95530b39084bdacccf4bb8e", "76bb839cac394b5777c64b6d4b570a27",
+      "a74ee60527be86059e822f7463f49ad5", "b157a40aaa14391c237471ba6d148a50",
+      "d4f7bd2e97e2b23f7a6a059837a10b2a", "8a9bcb30e9aff59b6feef5d1bf546d28",
+  };
+  static const char* const kDigests4x8[kNumIntraPredictors] = {
+      "4c2a59e1d4a58c129c709f05d1a83f4a", "5fbedd99a90a20727195dfbe8f9969ad",
+      "d4645e21ccf5f6d3c4ca7a3d9b0156ba", "98aa17ea5423192c81a04afd2d2669ed",
+      "67dad5b5eefdeb2af1e4d3875b282c6c", "881dcafd6323509fb80cd5bbdf2870c4",
+      "03ece373dfd56bd2fd86ad00ad6f5000", "41b28f2578d2ed7f38e708d57b231948",
+      "9f935505190f52ff4da9556e43f607be", "815700d2abb055bce6902d130e77416d",
+  };
+  static const char* const kDigests4x16[kNumIntraPredictors] = {
+      "bfc47cd4eef143a6ebf517730756a718", "ef07a3af3e353f9dfaebc48c8ac92c82",
+      "ceec5d9d24254efd3c6a00cbf11dd24d", "4e07f512a69cf95608c3c0c3013ed808",
+      "cedb7c900bb6839026bf79d054edb4fc", "48d958a18a019809f12eb2ad2eb358bc",
+      "8f296f4b9fb621a910368609cc2cccdf", "073a6f2ca8a23d6131ff97e2a3b736e1",
+      "f4772cc60b68c4f958c08c0fd8eb8d48", "2f8946cf19abecf0fda3addbfb8f9dcf",
+  };
+  static const char* const kDigests8x4[kNumIntraPredictors] = {
+      "4f245b07a91e6d604da9f22cf277d6f1", "a6dc25d1e24ba9e842c312f67eea211d",
+      "0475204441f44ea95bfd69c6e04eaed8", "313bcf1e2fc762d31ff765d3c18a6f67",
+      "7e9223ece684a1885c2108741052c6c8", "79f1e6f070d9b1d0f1de2ff77bccc0dc",
+      "63adca1101ee4799b1cfa26d88aa0657", "e8b940a5e39ea5313930c903464de843",
+      "42a8e470d3b000f4f57c44c632f0051b", "e8a57663f73da3d4320f8e82a3fecfc2",
+  };
+  static const char* const kDigests8x8[kNumIntraPredictors] = {
+      "7fa3c8bdd9ce04dc4df27863499cf4d4", "83f1312edd9af928a1cef60613730bc3",
+      "ceb35042adc6095a545b490f20e5d81b", "73aa503f329a055ff59a24093e682c41",
+      "14a9a427525ec38d2eb13e698728e911", "9143ddf66234e74acc156565d684fcac",
+      "05182bbe4fd90f3b496033ee5b7c54f9", "d9c6184c23af1f5a903a4a00539b883a",
+      "c4c2d4000ca2defc7a8169215121d9fc", "0b938bc7782b32796bffece28d17bb69",
+  };
+  static const char* const kDigests8x16[kNumIntraPredictors] = {
+      "50197f063138616c37ef09f8bf8a3016", "ef2008f6d9f2176feb17b7d4312022e2",
+      "0d243ffbba0a2e65738d7ee768620c36", "51b52564a2733c2c56ba319db5d8e3b8",
+      "0e2b41482ac1347c3bb6d0e394fe7bec", "edb43c19850452e6b20dfb2c001adb0b",
+      "6cd29f537b5e4180f5aaefd9140b65ef", "6808f618bdff33e0f3d6db60ea487bc1",
+      "0303c17746192b0c52b4d75ea97ca24d", "225d1debd7828fa01bc9a610a443cda9",
+  };
+  static const char* const kDigests8x32[kNumIntraPredictors] = {
+      "dc047c402c6ac4014d621fbd41b460d5", "49eb33c3a112f059e02d6d4b99da8b41",
+      "c906c9105a406ae6c63e69f57ed2fc7c", "2ead452591ddd2455660f96ce79314ab",
+      "437a2a78562752ee8291227f88e0323a", "51834dbdcf1e89667ffbb931bec9006c",
+      "959c1778e11a7c61a5a97176c79ecb6a", "2e51e44dd1953fc6fccc3b1c1ca602ed",
+      "7f94114cddb0ba780cc0c8d00db3f8d2", "b5b3770e6061249a3206915a3f9464e7",
+  };
+  static const char* const kDigests16x4[kNumIntraPredictors] = {
+      "9deb173fa962d9adde8a9ae256708c32", "264624b41e43cfe9378ee9b4fb5028a6",
+      "404919a41bdc7f1a1f9d089223373bb8", "5294ed9fcc16eaf5f9a1f66a2a36ae7c",
+      "a2ed1fa4262bca265dcc62eb1586f0ac", "58494af62f86464dbe471130b2bc4ab0",
+      "fe1f25f7096fc3426cc7964326cc46ad", "cf7f6c8f7257436b9934cecf3b7523e1",
+      "6325036f243abfcd7777754e6a7bdacc", "9dce11a98e18422b04dd9d7be7d420da",
+  };
+  static const char* const kDigests16x8[kNumIntraPredictors] = {
+      "92d5b7d4033dcd8cb729bf8e166e339a", "6cbd9f198828fd3422c9bfaf8c2f1c1d",
+      "2b204014b6dc477f67b36818bcdab1ca", "2ce0b9cf224d4654168c559d7c1424c2",
+      "ec70341b9dd57b379f5283820c9461c7", "3fe1e2a20e44171c90ebca5a45b83460",
+      "0305852b25351ff472a45f45ec1638fa", "565c78271fbe3b25b0eee542095be005",
+      "8bc15e98659cef6236bcb072541bb2ca", "875c87bf4daba7cb436ea2fdb5a427dd",
+  };
+  static const char* const kDigests16x16[kNumIntraPredictors] = {
+      "c9d12bce78d8846f081345906e1315f4", "0b57c8fde6dec15458b1c289245100cb",
+      "1c11978c4e6bbc77767395c63d2f70a8", "e749f26b26b46d8cb7cb13c1c777db94",
+      "40459af05e865e94ff7adcdec1685c15", "f3ae419e99a60dbde3afa24ba6588a36",
+      "fe3912418bca24cee3132de2c193d1fc", "cdc8e3ce27a12f1cbfe01d1adf2eb6bd",
+      "ce354b30ce15a6918172dea55a292b93", "e762d01726d641194982a5fb8c148eb7",
+  };
+  static const char* const kDigests16x32[kNumIntraPredictors] = {
+      "ad8f118b07e053df3887215449633a07", "e8979aa743aef82937d93d87fc9fdb85",
+      "a8afb62cbf602cfcd4b570832afe1d55", "404183cf003764a4f032f0f4810cd42c",
+      "4afcf1bc5589a13b11679571aa953b86", "202df8f5a2d7eb3816de172608115f2b",
+      "ce42bca92d6d7f9df85dbaac72e35064", "61c463c8070b78ca2bdf578044fec440",
+      "3abf6e4d779208e15e3f9a0dfc0254f9", "13df5504084105af7c66a1b013fe44e1",
+  };
+  static const char* const kDigests16x64[kNumIntraPredictors] = {
+      "3ac1f642019493dec1b737d7a3a1b4e5", "cbf69d5d157c9f3355a4757b1d6e3414",
+      "96d00ddc7537bf7f196006591b733b4e", "8cba1b70a0bde29e8ef235cedc5faa7d",
+      "35f9ee300d7fa3c97338e81a6f21dcd4", "aae335442e77c8ebc280f16ea50ba9c7",
+      "a6140fdac2278644328be094d88731db", "2df93621b6ff100f7008432d509f4161",
+      "c77bf5aee39e7ed4a3dd715f816f452a", "02109bd63557d90225c32a8f1338258e",
+  };
+  static const char* const kDigests32x8[kNumIntraPredictors] = {
+      "155688dec409ff50f2333c14a6367247", "cf935e78abafa6ff7258c5af229f55b6",
+      "b4bf83a28ba319c597151a041ff838c3", "fe97f3e6cd5fe6c5979670c11d940dda",
+      "b898c9a989e1e72461a6f47e913d5383", "bb73baa6476ce90118e83e2fd08f2299",
+      "c93be6d8ec318bd805899466821bb779", "ab366991ef842e9d417d52241f6966e6",
+      "9e7e4c96a271e9e40771eac39c21f661", "9459f2e6d1291b8b8a2fe0635ce1a33d",
+  };
+  static const char* const kDigests32x16[kNumIntraPredictors] = {
+      "48374c1241409e26d81e5106c73da420", "97c918bdba2ece52156dbc776b9b70d4",
+      "a44ce9c03f6622a3e93bfe3b928eb6f1", "2384ad95e3e7302f20857121e187aa48",
+      "47e72c6dc0087b6fd99e91cff854c269", "142dc3cbb05b82a496780f7fc3d66ccc",
+      "4a39fb768efcd4f30d6eae816e6a68c4", "d0c31f9d52d984a0335557eafe2b47fa",
+      "81b3af5c7893729b837e4d304917f7cd", "941cbcd411887dc7fa3a5c7395690d1a",
+  };
+  static const char* const kDigests32x32[kNumIntraPredictors] = {
+      "00892ee43a1bbb11347c1f44fb94b1a2", "d66397ba868e62cec99daf5ea73bebd0",
+      "65fe746e79ac1e779caae8abcc15eb6b", "8e308fe96b9845112d79c54f9d7981a0",
+      "47bc8847a7c9aed3417cd5250ba57875", "1a4008b7f0f61a3c73a2ee1d1452b414",
+      "24d25ef488bb457a5a4c4892e47a363d", "6d9d964f5317ab32a8edf57c23775238",
+      "544fc36c1a35c588359ae492cb5bc143", "ac170d94dbd944e9723de9c18bace1a3",
+  };
+  static const char* const kDigests32x64[kNumIntraPredictors] = {
+      "7d0bd7dea26226741dbca9a97f27fa74", "a8bdc852ef704dd4975c61893e8fbc3f",
+      "f29d6d03c143ddf96fef04c19f2c8333", "ad9cfc395a5c5644a21d958c7274ac14",
+      "45c27c5cca9a91b6ae8379feb0881c9f", "8a0b78df1e001b85c874d686eac4aa1b",
+      "ce9fa75fac54a3f6c0cc3f2083b938f1", "c0dca10d88762c954af18dc9e3791a39",
+      "61df229eddfccab913b8fda4bb02f9ac", "4f4df6bc8d50a5600b573f0e44d70e66",
+  };
+  static const char* const kDigests64x16[kNumIntraPredictors] = {
+      "e99d072de858094c98b01bd4a6772634", "525da4b187acd81b1ff1116b60461141",
+      "1348f249690d9eefe09d9ad7ead2c801", "a5e2f9fb685d5f4a048e9a96affd25a4",
+      "873bfa9dc24693f19721f7c8d527f7d3", "0acfc6507bd3468e9679efc127d6e4b9",
+      "57d03f8d079c7264854e22ac1157cfae", "6c2c4036f70c7d957a9399b5436c0774",
+      "42b8e4a97b7f8416c72a5148c031c0b1", "a38a2c5f79993dfae8530e9e25800893",
+  };
+  static const char* const kDigests64x32[kNumIntraPredictors] = {
+      "68bd283cfd1a125f6b2ee47cee874d36", "b4581311a0a73d95dfac7f8f44591032",
+      "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", "db9d82921fd88b24fdff6f849f2f9c87",
+      "804179f05c032908a5e36077bb87c994", "fc5fd041a8ee779015394d0c066ee43c",
+      "68f5579ccadfe9a1baafb158334a3db2", "fe237e45e215ab06d79046da9ad71e84",
+      "9a8a938a6824551bf7d21b8fd1d70ea1", "eb7332f2017cd96882c76e7136aeaf53",
+  };
+  static const char* const kDigests64x64[kNumIntraPredictors] = {
+      "d9a906c0e692b22e1b4414e71a704b7e", "12ac11889ae5f55b7781454efd706a6a",
+      "3f1ef5f473a49eba743f17a3324adf9d", "a6baa0d4bfb2269a94c7a38f86a4bccf",
+      "47d4cadd56f70c11ff8f3e5d8df81161", "de997744cf24c16c5ac2a36b02b351cc",
+      "23781211ae178ddeb6c4bb97a6bd7d83", "a79d2e28340ca34b9e37daabbf030f63",
+      "0372bd3ddfc258750a6ac106b70587f4", "228ef625d9460cbf6fa253a16a730976",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return 
kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest12bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests12bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest12bpp, FixedInput) { + TestSpeed(GetIntraPredDigests12bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest12bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest12bpp, Random) { TestRandomValues(); } +#endif // LIBGAV1_MAX_BITDEPTH == 12 + constexpr TransformSize kTransformSizes[] = { kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, @@ -700,6 +899,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, IntraPredTest12bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc index 1b0064f..0bbdffa 100644 --- a/src/dsp/inverse_transform.cc +++ b/src/dsp/inverse_transform.cc @@ -18,6 +18,7 @@ #include <cassert> #include <cstdint> #include <cstring> +#include <type_traits> #include "src/dsp/dsp.h" #include "src/utils/array_2d.h" @@ -25,6 +26,15 @@ #include "src/utils/compiler_attributes.h" #include "src/utils/logging.h" +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK +#endif + +#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \ + LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK +#include <cinttypes> +#endif + namespace libgav1 { namespace dsp { namespace { @@ -34,24 +44,25 @@ namespace { constexpr uint8_t kTransformColumnShift = 4; -#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) -#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK -#endif - -int32_t RangeCheckValue(int32_t value, int8_t range) { +template <typename T> +int32_t RangeCheckValue(T value, int8_t range) { #if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK + static_assert( + std::is_same<T, int32_t>::value || std::is_same<T, std::int64_t>::value, + ""); assert(range <= 32); const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1))); const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1); if (min > value || value > max) { - LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n", - value, range); + LIBGAV1_DLOG(ERROR, + "coeff out of bit range, value: %" PRId64 " bit range %d", + static_cast<int64_t>(value), range); assert(min <= value && value <= max); } #endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK static_cast<void>(range); - return value; + return static_cast<int32_t>(value); } template <typename Residual> @@ -433,7 +444,13 @@ void Adst4_C(void* dest, int8_t range) { // Section 7.13.2.6: It is a requirement of bitstream conformance that all // values stored in the s and x arrays by this process are representable by // a signed integer using range + 12 bits of precision. - int32_t s[7]; + // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit + // content. For simplicity in unoptimized code, int64_t is used for both 10 & + // 12-bit. SIMD implementations can allow these to rollover on platforms + // where this has defined behavior. 
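// The width-selection idiom used below, shown in isolation as a sketch;
// MultiplyRound12 is a hypothetical helper, not libgav1 API. With 8-bit
// streams Residual is int16_t and int32_t intermediates suffice; with
// 10/12-bit streams Residual is int32_t, so multiplier products are widened
// to int64_t and only narrowed after the rounding shift, keeping
// non-conformant input from hitting undefined signed overflow. Assumes an
// arithmetic right shift on signed values, as RightShiftWithRounding
// effectively does.
#include <cstdint>
#include <type_traits>

template <typename Residual>
using Intermediate =
    typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;

template <typename Residual>
int32_t MultiplyRound12(Residual coeff, int32_t multiplier) {
  using T = Intermediate<Residual>;
  const T product = static_cast<T>(multiplier) * coeff;
  return static_cast<int32_t>((product + (T{1} << 11)) >> 12);
}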
+ using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; + Intermediate s[7]; s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12); s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12); s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12); @@ -454,19 +471,23 @@ s[0] = RangeCheckValue(s[0] + s[3], range + 12); s[1] = RangeCheckValue(s[1] - s[4], range + 12); s[3] = s[2]; - s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12); + // With range checking enabled b7 would be trapped above. This prevents an + // integer sanitizer warning. In SIMD implementations the multiply can be + // allowed to rollover on platforms where this has defined behavior. + const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7; + s[2] = RangeCheckValue(adst2_b7, range + 12); // stage 4. s[0] = RangeCheckValue(s[0] + s[5], range + 12); s[1] = RangeCheckValue(s[1] - s[6], range + 12); // stages 5 and 6. - const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12); - const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12); - int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12); + const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12); + const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12); + Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12); x3 = RangeCheckValue(x3 - s[3], range + 12); - int32_t dst_0 = RightShiftWithRounding(x0, 12); - int32_t dst_1 = RightShiftWithRounding(x1, 12); - int32_t dst_2 = RightShiftWithRounding(s[2], 12); - int32_t dst_3 = RightShiftWithRounding(x3, 12); + auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12)); + auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12)); + auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12)); + auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12)); if (sizeof(Residual) == 2) { // If the first argument to RightShiftWithRounding(..., 12) is only // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it @@ -840,6 +861,10 @@ void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, template <typename Residual> void Identity4Row_C(void* dest, int8_t shift) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; assert(shift == 0 || shift == 1); auto* const dst = static_cast<Residual*>(dest); // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding| // should be (1 + (1 << 1)) << 11. The following expression works for both // values of |shift|. const int32_t rounding = (1 + (shift << 1)) << 11; for (int i = 0; i < 4; ++i) { - // The intermediate value here will have to fit into an int32_t for it to be - // bitstream conformant. The multiplication is promoted to int32_t by - // defining kIdentity4Multiplier as int32_t.
- int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift); + const auto intermediate = + static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier; + int32_t dst_i = + static_cast<int32_t>((intermediate + rounding) >> (12 + shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -874,16 +899,24 @@ void Identity4Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } const int32_t rounding = (1 + (row_shift << 1)) << 11; + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier; int32_t dst_i = - (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift); + static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -923,11 +956,17 @@ void Identity8Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift); @@ -954,13 +993,19 @@ void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, template <typename Residual> void Identity16Row_C(void* dest, int8_t shift) { assert(shift == 1 || shift == 2); + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); const int32_t rounding = (1 + (1 << shift)) << 11; for (int i = 0; i < 16; ++i) { - // The intermediate value here will have to fit into an int32_t for it to be - // bitstream conformant. The multiplication is promoted to int32_t by - // defining kIdentity16Multiplier as int32_t. - int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift); + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for all cases. 
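// The rounding constant just above, (1 + (1 << shift)) << 11, folds the
// spec's two round-to-nearest steps (descale by 12 bits, then the row shift)
// into one shift. A self-contained sanity check of that equivalence,
// assuming arithmetic right shifts on signed values; Round2 mirrors the AV1
// spec's rounding shift and is written out here only for the check:
#include <cassert>
#include <cstdint>

inline int64_t Round2(int64_t v, int bits) {
  return (v + (int64_t{1} << (bits - 1))) >> bits;
}

inline void CheckFoldedRounding() {
  static const int64_t kSamples[] = {-123456, -1, 0, 1, 99999, 1 << 20};
  for (int shift = 1; shift <= 2; ++shift) {
    const int32_t rounding = (1 + (1 << shift)) << 11;
    for (const int64_t v : kSamples) {
      assert(((v + rounding) >> (12 + shift)) == Round2(Round2(v, 12), shift));
    }
  }
}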
+ const auto intermediate = + static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier; + int32_t dst_i = + static_cast<int32_t>((intermediate + rounding) >> (12 + shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -985,16 +1030,24 @@ void Identity16Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. + using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } const int32_t rounding = (1 + (1 << row_shift)) << 11; + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier; int32_t dst_i = - (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift); + static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift)); if (sizeof(Residual) == 2) { dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); } @@ -1034,11 +1087,17 @@ void Identity32Column_C(void* dest, int8_t /*shift*/) { template <int bitdepth, typename Residual> void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round, int row_shift, bool is_row) { + // Note the intermediate value can only exceed 32 bits with 12-bit content. + // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit. 
+ using Intermediate = + typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type; auto* const dst = static_cast<Residual*>(dest); if (is_row) { if (should_round) { - dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + const auto intermediate = + static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier; + dst[0] = RightShiftWithRounding(intermediate, 12); } int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift); @@ -1612,6 +1671,148 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + InitAll<12, int32_t, uint16_t>(dsp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, 
kTransform1dAdst, + Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<12, int32_t>, + Identity4Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<12, int32_t>, + Identity8Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<12, int32_t>, + Identity16Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<12, int32_t>, + Identity32Column_C<int32_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>, + /*is_row=*/false>; 
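// The block above repeats one pattern per transform/size pair: install the C
// implementation only when no LIBGAV1_Dsp12bpp_* macro announces an
// optimized override, leaving that slot for SIMD init code to fill later. A
// reduced sketch of the shape, with MiniDsp and Dct4Row_C/Dct4Column_C as
// hypothetical stand-ins:
#include <cassert>
#include <cstdint>

using Transform1dFunc = void (*)(void* dest, int8_t shift);

struct MiniDsp {
  Transform1dFunc inverse_transforms[1][2];  // [size][row/column]
};

inline void Dct4Row_C(void* /*dest*/, int8_t /*shift*/) {}
inline void Dct4Column_C(void* /*dest*/, int8_t /*shift*/) {}

inline void InitMini12bpp(MiniDsp* dsp) {
  assert(dsp != nullptr);
#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct
  // No specialization was declared at compile time; keep the C fallback.
  dsp->inverse_transforms[0][0] = Dct4Row_C;
  dsp->inverse_transforms[0][1] = Dct4Column_C;
#endif
}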
+#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void InverseTransformInit_C() { @@ -1619,10 +1820,12 @@ void InverseTransformInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif // Local functions that may be unused depending on the optimizations // available. - static_cast<void>(RangeCheckValue); static_cast<void>(kBitReverseLookup); } diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc index 0ae23df..081dcc1 100644 --- a/src/dsp/inverse_transform_test.cc +++ b/src/dsp/inverse_transform_test.cc @@ -69,6 +69,7 @@ template <int bitdepth, typename SrcPixel, typename DstPixel> class InverseTransformTestBase : public testing::TestWithParam<TransformSize>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); InverseTransformTestBase() { switch (tx_size_) { case kNumTransformSizes: @@ -148,6 +149,7 @@ template <int bitdepth, typename Pixel, typename DstPixel> class InverseTransformTest : public InverseTransformTestBase<bitdepth, Pixel, DstPixel> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); InverseTransformTest() = default; InverseTransformTest(const InverseTransformTest&) = delete; InverseTransformTest& operator=(const InverseTransformTest&) = delete; @@ -533,6 +535,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using InverseTransformTest12bpp = InverseTransformTest<12, int32_t, uint16_t>; + +TEST_P(InverseTransformTest12bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest12bpp, DISABLED_Speed) { TestRandomValues(12000); } + +TEST_P(InverseTransformTest12bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest12bpp, + testing::ValuesIn(kTransformSizesAll)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake index 4bd1443..fedb35b 100644 --- a/src/dsp/libgav1_dsp.cmake +++ b/src/dsp/libgav1_dsp.cmake @@ -113,6 +113,7 @@ list(APPEND libgav1_dsp_sources_neon "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.cc" "${libgav1_source}/dsp/arm/inverse_transform_neon.h" + "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc" "${libgav1_source}/dsp/arm/loop_filter_neon.cc" "${libgav1_source}/dsp/arm/loop_filter_neon.h" "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc" diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc index 14d47bf..bb0583f 100644 --- a/src/dsp/loop_filter.cc +++ b/src/dsp/loop_filter.cc @@ -603,6 +603,73 @@ void Init10bpp() { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>; + +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal4; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs12bpp::Vertical4; + + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal6; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs12bpp::Vertical6; + + 
dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal8; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs12bpp::Vertical8; + + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal14; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs12bpp::Vertical14; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal4; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs12bpp::Vertical4; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal6; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs12bpp::Vertical6; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal8; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs12bpp::Vertical8; +#endif + +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs12bpp::Horizontal14; +#endif +#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs12bpp::Vertical14; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void LoopFilterInit_C() { @@ -610,6 +677,9 @@ void LoopFilterInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif // Local functions that may be unused depending on the optimizations // available. 
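// The static_cast<void> on the next line is the idiom these init files use
// to mark a file-local helper as intentionally referenced when every caller
// may be compiled out. A minimal illustration (names hypothetical):
inline int AdjustSomething(int x) { return x + 1; }
inline void SilenceUnusedWarning() {
  static_cast<void>(AdjustSomething);  // names the symbol, never calls it
}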
static_cast<void>(AdjustThresholds); diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc index d013a1b..63ed530 100644 --- a/src/dsp/loop_filter_test.cc +++ b/src/dsp/loop_filter_test.cc @@ -106,6 +106,7 @@ void InitInput(Pixel* dst, const int stride, const int bitdepth, template <int bitdepth, typename Pixel> class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); LoopFilterTest() = default; LoopFilterTest(const LoopFilterTest&) = delete; LoopFilterTest& operator=(const LoopFilterTest&) = delete; @@ -132,6 +133,9 @@ class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> { } } else if (absl::StartsWith(test_case, "NEON/")) { LoopFilterInit_NEON(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopFilterInit10bpp_NEON(); +#endif } else { FAIL() << "Unrecognized architecture prefix in test case name: " << test_case; @@ -203,22 +207,23 @@ void LoopFilterTest<bitdepth, Pixel>::TestRandomValues( template <int bitdepth, typename Pixel> void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const { - const LoopFilterType filter = kLoopFilterTypeHorizontal; - if (cur_loop_filters_[filter] == nullptr) return; - Pixel dst[kNumPixels], ref[kNumPixels]; const auto value = static_cast<Pixel>((1 << bitdepth) - 1); for (auto& r : dst) r = value; memcpy(ref, dst, sizeof(dst)); - const int outer_thresh = 24; - const int inner_thresh = 8; - const int hev_thresh = 0; - cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride, - outer_thresh, inner_thresh, hev_thresh); - ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride, - kBlockStride, kBlockStride, true)) - << "kLoopFilterTypeHorizontal output doesn't match reference"; + for (int i = 0; i < kNumLoopFilterTypes; ++i) { + if (cur_loop_filters_[i] == nullptr) return; + const int outer_thresh = 24; + const int inner_thresh = 8; + const int hev_thresh = 0; + cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, outer_thresh, + inner_thresh, hev_thresh); + ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride, + kBlockStride, kBlockStride, true)) + << ToString(static_cast<LoopFilterType>(i)) + << " output doesn't match reference"; + } } //------------------------------------------------------------------------------ @@ -328,6 +333,8 @@ TEST_P(LoopFilterTest10bpp, FixedInput) { TestRandomValues(GetDigests10bpp(size_), kNumTests); } +TEST_P(LoopFilterTest10bpp, SaturatedValues) { TestSaturatedValues(); } + INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp, testing::ValuesIn(kLoopFilterSizes)); @@ -339,7 +346,59 @@ INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp, INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp, testing::ValuesIn(kLoopFilterSizes)); #endif -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH == 12 +using LoopFilterTest12bpp = LoopFilterTest<12, uint16_t>; + +const char* const* GetDigests12bpp(LoopFilterSize size) { + static const char* const kDigestsSize4[kNumLoopFilterTypes] = { + "a14599cbfe2daee633d556a15c47b1f6", + "1f0a0794832de1012e2fed6b1cb02e69", + }; + static const char* const kDigestsSize6[kNumLoopFilterTypes] = { + "c76b24a73139239db10f16f36e01a625", + "3f75d904e9dcb1886e84a0f03f60f31e", + }; + static const char* const kDigestsSize8[kNumLoopFilterTypes] = { + "57c6f0efe2ab3957f5500ca2a9670f37", + "caa1f90c2eb2b65b280d678f8fcf6be8", + }; + 
static const char* const kDigestsSize14[kNumLoopFilterTypes] = { + "0c58f7466c36c3f4a2c1b4aa1b80f0b3", + "63077978326e6dddb5b2c3bfe6d684f5", + }; + + switch (size) { + case kLoopFilterSize4: + return kDigestsSize4; + case kLoopFilterSize6: + return kDigestsSize6; + case kLoopFilterSize8: + return kDigestsSize8; + case kLoopFilterSize14: + return kDigestsSize14; + default: + ADD_FAILURE() << "Unknown loop filter size" << size; + return nullptr; + } +} + +TEST_P(LoopFilterTest12bpp, DISABLED_Speed) { + TestRandomValues(nullptr, kNumSpeedTests); +} + +TEST_P(LoopFilterTest12bpp, FixedInput) { + TestRandomValues(GetDigests12bpp(size_), kNumTests); +} + +TEST_P(LoopFilterTest12bpp, SaturatedValues) { TestSaturatedValues(); } + +INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest12bpp, + testing::ValuesIn(kLoopFilterSizes)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc index 2301a3e..eb8052c 100644 --- a/src/dsp/loop_restoration.cc +++ b/src/dsp/loop_restoration.cc @@ -922,7 +922,6 @@ void Init8bpp() { } #if LIBGAV1_MAX_BITDEPTH >= 10 - void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(10); assert(dsp != nullptr); @@ -939,8 +938,27 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } - #endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>; + dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_WienerFilter + dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter + dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace void LoopRestorationInit_C() { @@ -948,6 +966,9 @@ void LoopRestorationInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h index de80926..8fefc40 100644 --- a/src/dsp/loop_restoration.h +++ b/src/dsp/loop_restoration.h @@ -39,16 +39,6 @@ namespace libgav1 { namespace dsp { -enum { - // Precision of a division table (mtable) - kSgrProjScaleBits = 20, - kSgrProjReciprocalBits = 12, - // Core self-guided restoration precision bits. - kSgrProjSgrBits = 8, - // Precision bits of generated values higher than source before projection. - kSgrProjRestoreBits = 4 -}; // anonymous enum - extern const uint8_t kSgrMaLookup[256]; // Initializes Dsp::loop_restorations. This function is not thread-safe. 
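Every 12bpp test added in this change follows the same golden-digest pattern as the existing 8/10bpp tests: run the C implementation over deterministic pseudo-random input and compare a hash of the output against a stored string, so any behavioral change surfaces as a digest mismatch without needing a reference decoder. A rough sketch of the shape, with HashBlock as a hypothetical stand-in for the MD5 helper the real tests use:

#include <cstddef>
#include <cstdint>

// Simplified FNV-1a over 16-bit samples; illustration only.
inline uint64_t HashBlock(const uint16_t* data, size_t n) {
  uint64_t h = 1469598103934665603u;
  for (size_t i = 0; i < n; ++i) {
    h = (h ^ data[i]) * 1099511628211u;
  }
  return h;
}

// A fixed-input test then reduces to:
//   FillWithSeededRandom(buffer, kSize);  // deterministic input
//   filter_under_test(buffer, ...);       // C implementation
//   EXPECT_EQ(kStoredDigest, HashBlock(buffer, kSize));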
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc index 4c54bc6..5c645b8 100644 --- a/src/dsp/loop_restoration_test.cc +++ b/src/dsp/loop_restoration_test.cc @@ -55,6 +55,7 @@ template <int bitdepth, typename Pixel> class SelfGuidedFilterTest : public testing::TestWithParam<int>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); SelfGuidedFilterTest() = default; SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete; SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete; @@ -159,26 +160,34 @@ void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData( template <int bitdepth, typename Pixel> void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index, Pixel value) { - static const char* const kDigest[][2][kNumRadiusTypes] = { + static const char* const kDigest[][3][kNumRadiusTypes] = { {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0", "a03314fc210bee68c7adbb44d2bbdac7"}, {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54", - "a6583fe9359877f4a259c81d900fc4fb"}}, + "a6583fe9359877f4a259c81d900fc4fb"}, + {"8f9b6944c8965f34d444a667da3b0ebe", "84fa62c491c67c3a435fd5140e7a4f82", + "d04b62d97228789e5c6928d40d5d900e"}}, {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a", "27100f37b3e42a5f2a051e1566edb6f8"}, {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3", - "69c274ac59c99999e1bfbf2fc4586ebd"}}, + "69c274ac59c99999e1bfbf2fc4586ebd"}, + {"86ff2318bf8a584b8d5edd710681d621", "f6e1c104a764d6766cc278d5b216855a", + "6d928703526ab114efba865ff5b11886"}}, {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4", "92f31086ba2f9e1508983b22d93a4e5c"}, {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13", - "43dd7df2c2a601262c68cd8af1c61b82"}}, + "43dd7df2c2a601262c68cd8af1c61b82"}, + {"1ab6138c3a82ac8ccd840f0553fdfb58", "be3bf92633f7165d3ad9c327d2dd53fe", + "41115efff3adeb541e04db23faa22f23"}}, {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4", "f8a6a025827f29f857bed3e28ba3ea33"}, {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42", - "20dcbe8e317a4373bebf11d56adc5f02"}}}; + "20dcbe8e317a4373bebf11d56adc5f02"}, + {"7971a60337fcdb662c92db051bd0bb41", "75f89f346c2a37bf0c6695c0482531e6", + "1595eeacd62cdce4d2fb094534c22c1e"}}}; if (target_self_guided_filter_func_ == nullptr) return; ASSERT_LT(value, 1 << bitdepth); - constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + constexpr int bd_index = (bitdepth - 8) / 2; libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); const Pixel* const src = src_ + kOffset; Pixel* const dst = dst_ + kOffset; @@ -207,29 +216,39 @@ void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index, template <int bitdepth, typename Pixel> void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { - static const char* const kDigest[][2][kNumRadiusTypes] = { + static const char* const kDigest[][3][kNumRadiusTypes] = { {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb", "ca67159cd29475ac5d52ca4a0df3ea10"}, {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678", - "a8ba988283d9e1ad1f0dcdbf6bbdaade"}}, + "a8ba988283d9e1ad1f0dcdbf6bbdaade"}, + {"d95e98d031f9ba290e5183777d1e4905", "f806853cfadb50e6dbd4898412b92934", + "741fbfdb79cda695afedda3d51dbb27f"}}, {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9", "a4005899fa8d3c3c4669910f93ff1290"}, {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e", - "07203ad761775d5d317f2b7884afd9fe"}}, + "07203ad761775d5d317f2b7884afd9fe"}, + {"76b9ef906090fa81af64cce3bba0a54a", "8eecc59acdef8953aa9a96648c0ccd2c", + "6e45a0ef60e0475f470dc93552047f07"}}, {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084", "475bcb6a58f87da7723f6227bc2aca0e"}, {"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f", - "7cb5c5dbdb3d1c54cfa00def450842dc"}}, + "7cb5c5dbdb3d1c54cfa00def450842dc"}, + {"0e3dc23150d18c9d366d15e174727311", "8495122917770d822f1842ceff987b03", + "4aeb9db902072cefd6af0aff8aaabd24"}}, {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9", "f1eda6d15b37172199d9949c2315832f"}, {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8", - "b23dc0b54c3500248d53377030428a61"}}, + "b23dc0b54c3500248d53377030428a61"}, + {"9c331f2b9410354685fe904f6c022dfa", "b540b0045b7723fbe962fd675db4b077", + "3cecd1158126c9c9cc2873ecc8c1a135"}}, {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36", "23966cba3e0e7803eeb951905861e0dd"}, {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de", - "dcee48f94126a2132963e86e93dd4903"}}}; + "dcee48f94126a2132963e86e93dd4903"}, + {"beb3dd8a2dbc5f83ef171b0ffcead3ab", "c373bd9c46bdb89a3d1e41759c315025", + "cd407b212ab46fd4a451d5dc93a0ce4a"}}}; if (target_self_guided_filter_func_ == nullptr) return; - constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + constexpr int bd_index = (bitdepth - 8) / 2; const int num_inputs = speed ? 1 : 5; #if LIBGAV1_ENABLE_NEON const int num_tests = speed ? 
4000 : 1; @@ -324,10 +343,28 @@ INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using SelfGuidedFilterTest12bpp = SelfGuidedFilterTest<12, uint16_t>; + +TEST_P(SelfGuidedFilterTest12bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 2048); + TestFixedValues(3, 4095); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest12bpp, + testing::ValuesIn(kUnitWidths)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + template <int bitdepth, typename Pixel> class WienerFilterTest : public testing::TestWithParam<int>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); WienerFilterTest() = default; WienerFilterTest(const WienerFilterTest&) = delete; WienerFilterTest& operator=(const WienerFilterTest&) = delete; @@ -433,14 +470,17 @@ void WienerFilterTest<bitdepth, Pixel>::SetInputData( template <int bitdepth, typename Pixel> void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id, Pixel value) { - static const char* const kDigest[2][4] = { + static const char* const kDigest[3][4] = { {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d", "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"}, {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916", - "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}}; + "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}, + {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916", + "961eeb92bd9d85eb47e3961ee93d279a", "039a279232bc90eebc0ec2fe3e18a7e1"}, + }; if (target_wiener_filter_func_ == nullptr) return; ASSERT_LT(value, 1 << bitdepth); - constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + constexpr int bd_index = (bitdepth - 8) / 2; const Pixel* const src = src_ + kOffset; Pixel* const dst = dst_ + kOffset; for (const auto vertical_order : kWienerOrders) { @@ -470,7 +510,7 @@ void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id, template <int bitdepth, typename Pixel> void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { - static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = { + static const char* const kDigest[3][kNumWienerOrders][kNumWienerOrders] = { {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b", "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"}, {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9", @@ -488,9 +528,19 @@ void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) { "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"}, {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf", "7cbc1562a9dd08e1973b3b9ac1afc765", - "3c91bf1a34672cd40bf261c5820d3ec3"}}}; + "3c91bf1a34672cd40bf261c5820d3ec3"}}, + {{"501b57370c781372b514accd03d161af", "a4569b5eff7f7e8b696934d192619be5", + "24eb2aa43118a8822f7a6a7384ab9ea7", "edd7ac227733b5a4496bfdbdf4eb34d7"}, + {"77624cf73299a1bd928eae3eb8945dbe", "b3f311cacbf45fa892761462d31b2598", + "977c063d93a4b95cb365363763faa4da", "02313c9d360a1e0180ed05d3e4444c3d"}, + {"f499655ecdcbe0ac48553f1eee758589", "a009c83c03e47cbd05c1243e28579bd9", + "d5f0b4fd761ff51efce949e6c5ec4833", "e3a9a57aacd2e6cfe0f792a885b3e0e3"}, + {"b4cf906e9bb02ffca15c1e9575962ca2", "d0ca9f933978c0c31175ba1b28a44ae8", + "81ac1475530ffbd1c8d3ce7da87ffe6b", + "b96412949c2e31b29388222ac8914fa2"}}, + }; if (target_wiener_filter_func_ == nullptr) return; - constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + constexpr int bd_index = (bitdepth - 8) / 2; #if LIBGAV1_ENABLE_NEON const int num_tests = speed ? 
5000 : 1; #else @@ -630,9 +680,27 @@ INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp, INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp, testing::ValuesIn(kUnitWidths)); #endif - #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using WienerFilterTest12bpp = WienerFilterTest<12, uint16_t>; + +TEST_P(WienerFilterTest12bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 2048); + TestFixedValues(3, 4095); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest12bpp, TestCompare2C) { TestCompare2C(); } + +INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest12bpp, + testing::ValuesIn(kUnitWidths)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc index 207fde0..34d7fe8 100644 --- a/src/dsp/mask_blend.cc +++ b/src/dsp/mask_blend.cc @@ -197,7 +197,50 @@ void Init10bpp() { dsp->inter_intra_mask_blend_8bpp[2] = nullptr; #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>; + dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>; + dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>; + dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>; + dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>; + dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>; + // These are only used with 8-bit. + dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_MaskBlend444 + dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlend422 + dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlend420 + dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>; #endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444 + dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422 + dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420 + dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>; +#endif + // These are only used with 8-bit. 
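// The mask_blend table filled above is indexed [subsampling][is_inter_intra],
// where the first index collapses the chroma subsampling mode. A one-line
// helper (hypothetical name) makes the mapping behind the template arguments
// explicit:
inline int MaskBlendIndex(int subsampling_x, int subsampling_y) {
  return subsampling_x + subsampling_y;  // 4:4:4 -> 0, 4:2:2 -> 1, 4:2:0 -> 2
}
// The inter_intra_mask_blend_8bpp entries operate on 8-bit pixels directly,
// which is why the 12bpp init nulls them out on the lines that follow.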
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -206,6 +249,9 @@ void MaskBlendInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc index be80b11..29dd43b 100644 --- a/src/dsp/mask_blend_test.cc +++ b/src/dsp/mask_blend_test.cc @@ -14,6 +14,7 @@ #include "src/dsp/mask_blend.h" +#include <cassert> #include <cstddef> #include <cstdint> #include <cstring> @@ -103,6 +104,8 @@ const char* GetDigest8bpp(int id) { "beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff", "be8abad1da69e4d238a45fc02a0061cf", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return kDigest[id]; } @@ -157,10 +160,69 @@ const char* GetDigest10bpp(int id) { "56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442", "f4a4f4d7c8b93c0486cf3cbaa26fbc19", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return kDigest[id]; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigest12bpp(int id) { + static const char* const kDigest[] = { + "79a505b3877177197c94f0faeb0c9ec6", "cd22657d242f30c88bb83eae9efbbcce", + "c4c60a60976d119df3832ff6956e0181", "796bd78bf2346e8dfd61cecbf508ea0e", + "79e06cc6f880daf6cdb59b9b3a8efe1c", "f0643108e6b57bd566bc0d47b2dc64a1", + "8272a471e538ca469eaf5c997309589c", "3094741b63a29925da83dc1dc187654a", + "d0141df80f2335ed6051397cb2a5bc61", "33d9fd317b74f4572afbe004f991ca83", + "ea2413cd11bf1da93de9285381b471df", "c4f78ae2b994a3a999cb3f5dac2bb498", + "44804ec226453bc5f688506b56ad2a8a", "9de9c12a5f3bb8d4af13da8807dfe53f", + "c190dac15c08f2e591b222e1d75b60c2", "c46889b58b44d242e24b91ef531e9176", + "b6697e1256b60b3426a8980c7c6f9a80", "1e0eb156152fbb74b0cff41bdbdf98b5", + "98ab6c0abc45fd44565f84e66dc71133", "f2f2126fac1b7c0c7b7ff511c6f3c91e", + "0cc720e878cfa35f9b72762d08adb1bf", "6efee9ce87e098122dd05525f4c74a2f", + "187270514a93bd7065d2cfdb02146959", "947be7f2921b5a192d4296b2060a215c", + "42f02b046eda2a94133032184fdaa26d", "487e94b20867e7021dd1f10d477c3acf", + "9f9eac4394d8821f5c14857a28c5549b", "75d781b60c1f4aa44ceb6bc65f597a52", + "779f9ac3c01a86812964ccc38da2711a", "16dc8824efbd7a47808ccdbf8e37df56", + "e72899a8ddf6cc816e1917c25739a512", "96a4bcaedae79b55399d931fecd64312", + "5c5e8f4a4f0153315133e4e86a02c3a6", "d1c339b6f6cc0eabdd6674028e1f4260", + "4ef5868adaf6712d033dce9e51837c0b", "ed90a4ddfc463dddfe71314bc3415b4e", + "2312299492a47246269d6d37e67c8c0c", "56baf1c4453c5cf5ce3d6857cff4aa8f", + "d534ce3430377b355c3f59695cfb188b", "f40248f1a6fac4299c9645350138f598", + "f2e3cbbd066d9d28304667d82312d950", "e8a7784eb367b72b96486bec856b873c", + "02941ae2cf8272b353268a30cf9c2ee0", "8f6273a5fa62b9a4225ebdbf2ce44e27", + "85bb0aaba73fe8c89dcee6b5c55d5cfc", "c28c63a4e46ee2a98dd2b58379971c8c", + "4af35738c29d27ca9930a488bacdffe6", "34a419cc3e6ab21cf099d244169d253e", + "7c5b8d19ac8a81b37011fabac10143d0", "e582811e05def83270d8f65060fe8966", + "24662536326615a3c325409e780f65bf", "717a7f7e99d329a74391477ef3c6d738", + "e0f38a3dba4c6e060b6ca12a18d75fc2", "fbd0cba6a27eb06e74c5ed376187e05c", + "14dfb487c4a7e989629a195810b814ee", "3cf6d595317ec46e08f6eaa0f0e99b43", + "b3cb98c418ea854e433b612fc532bac5", "262206cee670c082361497e51cbd0f43", + 
"84c11b103a9b0a61f07493dcd269e6fd", "bd9bd9994057371252398bf52c7586f0", + "72e5537ba5f04fe17b7a371bd12ca0e2", "5986a20b406ceed273f9e41bc0c4c775", + "d5eb9ea00ce19079b49562ba4a8cb574", "3205e6f3c532a63f8d5d939fa46bc444", + "cfb21ac467f21954903948d4e6c9a2a1", "bd9fd6aab18bbba8096746f9ed35a640", + "d42ec4f13f042014c5b4af5f03d19034", "8a7fdee2b57ac641e03365625850f5d6", + "d18638521275b3aa9dd463d067d6a390", "a7a71c433d85576198b52608c99cab47", + "96e2a2443bf8cfe32d7590c5011c7523", "6fbe7cd83208937229c11a8e3be5e1e9", + "ecf66dac310e332a108be639171b5cf3", "327b1656c61d795c30a914f52e3d7629", + "157d26190bde1a6f34680708bff5d02e", "d927bba0073263a7914a4076a5edfe29", + "b88930ec68e5e49da8204ef21635cea2", "58e174ed0036b1ac1f5a9bdd44860222", + "415055dfa80c6fe7c12e4d16cac22168", "9058939bfb5998d6ecd71d87a52be893", + "847894efa35f1528732ec3584f62f86f", "8aa9b33c0d9695690cb4088c32f31214", + "11e28ab9a3192a2bc9ffd3fd0a466a13", "f246009c5efafd9310fa8e365d23cab4", + "2381fcd9ee0ffceba5509879d9f5709d", "1cf1dc7c7c6ecf1f3381455c99e2239e", + "e74601883b53791045f50bbcbbbcc803", "22926eecefa94f9f39b9bb9dbb183e5b", + "128c24f5a5342aebb21bdaa87907daf7", "11c39f844a2e51cc4c80ffe1afa58e70", + "2c0548cff2145031e304d8f97abfd751", "66e1a3daf84029341b999b18bf86e5b3", + "0f790f210d5366bbad7eb352b4909dd9", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct MaskBlendTestParam { MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y, bool is_inter_intra, bool is_wedge_inter_intra) @@ -192,6 +254,7 @@ template <int bitdepth, typename Pixel> class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); MaskBlendTest() = default; ~MaskBlendTest() override = default; @@ -310,6 +373,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, PredType* src_2 = source2_; uint8_t* src_2_8bpp = source2_8bpp_; const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width; + const ptrdiff_t mask_stride = param_.width; uint8_t* mask_row = mask_; const int range_mask = (1 << (bitdepth)) - 1; for (int y = 0; y < height; ++y) { @@ -340,7 +404,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, mask_row[x] = rnd.Rand8() & 63; mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64]. } - mask_row += kStride; + mask_row += mask_stride; } absl::Duration elapsed_time; @@ -351,7 +415,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), ""); // source2_8bpp_ is modified in the call. 
memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_)); - func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride, + func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, mask_stride, width, height); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { @@ -363,7 +427,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, if (bitdepth != 8) { ASSERT_EQ(func_8bpp_, nullptr); } - func_(source1_, source2_, src_2_stride, mask_, kStride, width, height, + func_(source1_, source2_, src_2_stride, mask_, mask_stride, width, height, dest_, kDestStride); } elapsed_time += absl::Now() - start; @@ -520,6 +584,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using MaskBlendTest12bpp = MaskBlendTest<12, uint16_t>; + +TEST_P(MaskBlendTest12bpp, Blending) { Test(GetDigest12bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest12bpp, DISABLED_Speed) { + Test(GetDigest12bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest12bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc index 6b5c6e3..479cb1d 100644 --- a/src/dsp/obmc.cc +++ b/src/dsp/obmc.cc @@ -116,7 +116,28 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>; + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C<uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_ObmcVertical + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C<uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -125,6 +146,9 @@ void ObmcInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc index 3672e12..a10feb2 100644 --- a/src/dsp/obmc_test.cc +++ b/src/dsp/obmc_test.cc @@ -15,6 +15,7 @@ #include "src/dsp/obmc.h" #include <algorithm> +#include <cassert> #include <cstddef> #include <cstdint> #include <cstring> @@ -58,6 +59,8 @@ const char* GetDigest8bpp(int id) { "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548", "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return kDigest[id]; } @@ -75,6 +78,8 @@ const char* GetDigestSpeed8bpp(int id) { "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5", "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return kDigest[id]; } @@ -93,6 +98,8 @@ const char* GetDigest10bpp(int id) { "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a", "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / 
sizeof(kDigest[0])); return kDigest[id]; } @@ -110,10 +117,52 @@ const char* GetDigestSpeed10bpp(int id) { "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac", "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return kDigest[id]; } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigest12bpp(int id) { + static const char* const kDigest[] = { + "eb18c776d7b56280f01cca40b04a9c44", "058d4a6ed025eac5dcf7aec3203c0882", + "8355884d7470e9c6af9309ab23bee859", "2ba330551ac58d1d034b947d7ab9b59f", + "0d25cd773c81e4c57f82513e3b031f01", "b9075f7c3b9a240dbb015a24454eeb71", + "563ed8683723d1e4f2746280bca3db0a", "d7125306bd8c952d0f85fe1515ca16a7", + "5bf99c7e4a918c9b6a7e251484ea6527", "38ac9c685e8d2bd2771b6f2b38268301", + "abc39dbde7470e08b15417ee97c704b2", "37e12753d23b7a8df92b1d32f3170d9f", + "9a609776cfa31f64826225d0a6b7afdd", "ccdd89e70e94f751fd891b124c1c3210", + "2bbf7b095e26ed4f27e7d05e20117084", "9a1b403c3a7c00da5686bcb87f1270e8", + "701d651e391043ab8ebbd0023a430980", "0047f10bdd8321494e8e82597fe2f969", + "f97e662d139b2811e3d3227de95135a2", "852933b90d4a70f9254157381ed641e0", + "cfcda707ec8e4361ef741dc716888348", "95e34eab83b3159f61685db248c6a881", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} + +const char* GetDigestSpeed12bpp(int id) { + static const char* const kDigest[] = { + "6c0f37c41d72ce40d95545ac0f08d88a", "8a8efeb7d8b2f852d76d0176b6c6878f", + "5757c88d1cdc0cd29c47c346474161f0", "fef8cf06d16ba7357bfc061e43080cd3", + "6bd11582448532bce8b91cc8807ab6a0", "1e6dd42eada2d636e210f4e20a771102", + "377a0472f45fcb42f1712243ea845530", "e3760f2b6e69c1b40e71ecde711d227c", + "6721638d1a5dadb96ddd0ca067c737ca", "3d3a23210a8496a76991bcec5045808b", + "2cbd26ecf7d4e927ab569083d3ddb4ca", "7d61af2d7841d1a39a2e930bac166804", + "dd929506442fb1f2e67130fe8cdf487b", "c0e57f8d2546d5bcb646a24d09d83d7c", + "2989c6487456c92eb003c8e17e904f45", "5cfb60a3be6ee5c41e0f655a3020f687", + "28f37d47cb07aa382659ff556a55a4c6", "b6478ab317b11f592deb60d02ce62f2f", + "bc78e7250c101f82e794d4fa0ee55025", "24304ed23d336a46f205206d3c5d48ef", + "dc1e71d95d06c1086bb7f9e05e38bf39", "32606ef72985e7de608df2e8760784b7", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct ObmcTestParam { ObmcTestParam(int width, int height, ObmcDirection blending_direction) : width(width), height(height), blending_direction(blending_direction) {} @@ -130,6 +179,7 @@ std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) { template <int bitdepth, typename Pixel> class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); ObmcBlendTest() = default; ~ObmcBlendTest() override = default; @@ -206,11 +256,12 @@ void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest, src_2[x] = rnd.Rand16() & mask; } src_1 += kMaxBlendingBlockSize; - src_2 += kMaxBlendingBlockSize; + src_2 += width_; } } const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); - func_(source1_, stride, width_, height_, source2_, stride); + func_(source1_, stride, width_, height_, source2_, + width_ * sizeof(source2_[0])); if (use_fixed_values) { const bool success = test_utils::CompareBlocks( source1_, source2_, width_, height_, kMaxBlendingBlockSize, @@ -238,7 +289,7 @@ 
void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest, src_2[x] = rnd.Rand16() & mask; } src_1 += kMaxBlendingBlockSize; - src_2 += kMaxBlendingBlockSize; + src_2 += width_; } const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel); uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize]; @@ -247,7 +298,8 @@ void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest, memcpy(dest, source1_, sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize); const absl::Time start = absl::Now(); - func_(dest, stride, width_, height_, source2_, stride); + func_(dest, stride, width_, height_, source2_, + width_ * sizeof(source2_[0])); elapsed_time += absl::Now() - start; } memcpy(source1_, dest, @@ -338,6 +390,26 @@ INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using ObmcBlendTest12bpp = ObmcBlendTest<12, uint16_t>; + +TEST_P(ObmcBlendTest12bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 12) - 1); + Test(GetDigest12bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest12bpp, DISABLED_Speed) { + TestSpeed(GetDigestSpeed12bpp(GetDigestId()), + kNumSpeedTests / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest12bpp, + testing::ValuesIn(kObmcTestParam)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc index 570ba73..7593729 100644 --- a/src/dsp/super_res.cc +++ b/src/dsp/super_res.cc @@ -95,7 +95,23 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); + dsp->super_res_coefficients = nullptr; +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->super_res = SuperRes_C<12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_SuperRes + dsp->super_res = SuperRes_C<12, uint16_t>; #endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -104,6 +120,9 @@ void SuperResInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc index a93fc31..7b253ff 100644 --- a/src/dsp/super_res_test.cc +++ b/src/dsp/super_res_test.cc @@ -56,7 +56,16 @@ const char* GetDigest10bpp(int id) { "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"}; return kDigestSuperRes[id]; } -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigest12bpp(int id) { + static const char* const kDigestSuperRes[] = { + "9a08983d82df4983700976f18919201b", "6e5edbafcb6c38db37258bf79c00ea32", + "f5c57e6d3b518f9585f768ed19b91568", "b5de9b93c8a1a50580e7c7c9456fb615"}; + return kDigestSuperRes[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 struct SuperResTestParam { SuperResTestParam(int downscaled_width, int upscaled_width) @@ -69,6 +78,7 @@ template <int bitdepth, typename Pixel, typename Coefficient> class SuperResTest : public 
testing::TestWithParam<SuperResTestParam>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); SuperResTest() = default; void SetUp() override { test_utils::ResetDspTable(bitdepth); @@ -174,14 +184,23 @@ void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes( } } } - const char* expected_digest; - if (bitdepth == 8) { - expected_digest = GetDigest8bpp(test_id_); - } else { + const char* expected_digest = nullptr; + switch (bitdepth) { + case 8: + expected_digest = GetDigest8bpp(test_id_); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - expected_digest = GetDigest10bpp(test_id_); + case 10: + expected_digest = GetDigest10bpp(test_id_); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetDigest12bpp(test_id_); + break; #endif } + ASSERT_NE(expected_digest, nullptr); test_utils::CheckMd5Digest( "SuperRes", absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step, @@ -259,6 +278,25 @@ INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using SuperResTest12bpp = SuperResTest<12, uint16_t, int16_t>; + +TEST_P(SuperResTest12bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(2047, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest12bpp, RandomValues) { TestComputeSuperRes(0, 1); } + +TEST_P(SuperResTest12bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest12bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc index dd467ea..f62f1ed 100644 --- a/src/dsp/warp.cc +++ b/src/dsp/warp.cc @@ -111,14 +111,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, start_x += 8) { const int src_x = (start_x + 4) << subsampling_x; const int src_y = (start_y + 4) << subsampling_y; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's @@ -172,22 +166,24 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if (filter_params.ix4 - 7 >= source_width - 1 || + filter_params.ix4 + 7 <= 0) { // Regions 1 and 2. // Points to the left or right border of the first row of |src|. const Pixel* first_row_border = - (ix4 + 7 <= 0) ? src : src + source_width - 1; + (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1; // In general, for y in [-7, 8), the row number iy4 + y is clipped: // const int row = Clip3(iy4 + y, 0, source_height - 1); // In two special cases, iy4 + y is clipped to either 0 or // source_height - 1 for all y. 
In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 1. // Every sample used to calculate the prediction block has the same // value. So the whole prediction block has the same value. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const Pixel row_border_pixel = first_row_border[row * source_stride]; DestType* dst_row = dst + start_x - block_start_x; if (is_compound) { @@ -220,15 +216,15 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, for (int y = -7; y < 8; ++y) { // We may over-read up to 13 pixels above the top source row, or up // to 13 pixels below the bottom source row. This is proved below. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; int sum = first_row_border[row * source_stride]; sum <<= kFilterBits - kRoundBitsHorizontal; intermediate_result_column[y + 7] = sum; } // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); for (int y = 0; y < 8; ++y) { int sy = sy4 - MultiplyBy4(gamma); for (int x = 0; x < 8; ++x) { @@ -269,12 +265,14 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // source_height - 1 for all y. In the rest of the cases, iy4 + y is // bounded and we can avoid clipping iy4 + y by relying on a reference // frame's boundary extension on the top and bottom. - if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + if (filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0) { // Region 3. // Horizontal filter. - const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1; const Pixel* const src_row = src + row * source_stride; - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { int sx = sx4 - MultiplyBy4(alpha); for (int x = -4; x < 4; ++x) { @@ -300,7 +298,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= column <= (source_width - 1) + 13. // Therefore we may over-read up to 13 pixels before the source // row, or up to 13 pixels after the source row. - const int column = ix4 + x + k - 3; + const int column = filter_params.ix4 + x + k - 3; sum += kWarpedFilters[offset][k] * src_row[column]; } intermediate_result[y + 7][x + 4] = @@ -315,7 +313,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0. // It follows that -6 <= iy4 <= source_height + 5. This inequality is // used below. - int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + beta * 7; for (int y = -7; y < 8; ++y) { // We assume the source frame has top and bottom borders of at least // 13 pixels that extend the frame boundary pixels. 
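A note on the 13-pixel figure these comments keep citing: it follows directly from the region tests above. The sketch below works through the arithmetic under the stated bounds; the helper names are illustrative and not part of the patch.

// Outside regions 1 and 2 the code has established
//   iy4 - 7 < source_height - 1  and  iy4 + 7 > 0,
// i.e. -6 <= iy4 <= source_height + 5. The vertical tap offset y spans
// [-7, 8), so row = iy4 + y lies in [-13, source_height + 12]: at most 13
// rows above row 0 or 13 rows below row source_height - 1. The same bound
// holds for ix4 horizontally, which is why a 13-pixel border extension on
// the reference frame makes the unclipped reads safe.
constexpr int kMaxBorderOverread = 13;  // illustrative constant
constexpr bool RowReadsAreBounded(int iy4, int source_height) {
  return iy4 - 7 >= -kMaxBorderOverread &&
         iy4 + 7 <= (source_height - 1) + kMaxBorderOverread;
}
static_assert(RowReadsAreBounded(-6, 16), "lower edge");
static_assert(RowReadsAreBounded(16 + 5, 16), "upper edge");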
@@ -326,7 +325,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= row <= (source_height - 1) + 13. // Therefore we may over-read up to 13 pixels above the top source // row, or up to 13 pixels below the bottom source row. - const int row = iy4 + y; + const int row = filter_params.iy4 + y; const Pixel* const src_row = src + row * source_stride; int sx = sx4 - MultiplyBy4(alpha); for (int x = -4; x < 4; ++x) { @@ -352,7 +351,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // -13 <= column <= (source_width - 1) + 13. // Therefore we may over-read up to 13 pixels before the source // row, or up to 13 pixels after the source row. - const int column = ix4 + x + k - 3; + const int column = filter_params.ix4 + x + k - 3; sum += kWarpedFilters[offset][k] * src_row[column]; } intermediate_result[y + 7][x + 4] = @@ -367,8 +366,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, // Regions 3 and 4. // Vertical filter. DestType* dst_row = dst + start_x - block_start_x; - int sy4 = - (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - + MultiplyBy4(delta); // The spec says we should use the following loop condition: // y < std::min(4, block_start_y + block_height - start_y - 4); // We can prove that block_start_y + block_height - start_y >= 8, which @@ -460,7 +459,26 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>; + dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_Warp + dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>; #endif +#ifndef LIBGAV1_Dsp12bpp_WarpCompound + dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -469,6 +487,9 @@ void WarpInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/warp.h b/src/dsp/warp.h index 7367a9b..9c20f12 100644 --- a/src/dsp/warp.h +++ b/src/dsp/warp.h @@ -38,9 +38,39 @@ namespace libgav1 { namespace dsp { +// Section 7.11.3.5. +struct WarpFilterParams { + int64_t x4; + int64_t y4; + int ix4; + int iy4; +}; + // Initializes Dsp::warp. This function is not thread-safe. void WarpInit_C(); +// Section 7.11.3.5. +inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y, + int subsampling_x, + int subsampling_y, + const int* warp_params) { + WarpFilterParams filter_params; + // warp_params[2]/[5] require 17 bits (the others 14). With large resolutions + // the result of the multiplication will require 33. 
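Context for the int64_t widening that follows: src_x and src_y can occupy up to 17 bits after the subsampling shift (AV1 frame dimensions go up to 65536) and warp_params[2]/[5] are 17-bit signed values, so the product needs about 33 bits (that is the "33" in the comment above, a bit count) and can overflow a 32-bit int. A minimal sketch of the failure mode, with illustrative values rather than codec state:

#include <cstdint>

void OverflowSketch() {
  const int src_x = 1 << 16;       // plausible coordinate upper bound
  const int warp_param = 1 << 16;  // 17-bit signed parameter magnitude
  // Without widening, src_x * warp_param is a 32-bit multiply whose
  // mathematical result (~2^32) does not fit: signed overflow, undefined
  // behavior. Casting one operand first performs the multiply in 64 bits.
  const int64_t dst = static_cast<int64_t>(src_x) * warp_param;
  static_cast<void>(dst);
}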
+ const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] + + src_y * warp_params[3] + warp_params[0]; + const int64_t dst_y = src_x * warp_params[4] + + static_cast<int64_t>(src_y) * warp_params[5] + + warp_params[1]; + filter_params.x4 = dst_x >> subsampling_x; + filter_params.y4 = dst_y >> subsampling_y; + filter_params.ix4 = + static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits); + filter_params.iy4 = + static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits); + return filter_params; +} + } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc index 4d13051..c64c8d6 100644 --- a/src/dsp/warp_test.cc +++ b/src/dsp/warp_test.cc @@ -105,6 +105,8 @@ const char* GetDigest8bpp(int id) { "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30", "b2a0ce68db3cadd207299f73112bed74", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return is_compound ? kCompoundDigest[id] : kDigest[id]; } @@ -129,9 +131,38 @@ const char* GetDigest10bpp(int id) { "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047", "42eb66e752e9ef289b47053b5c73fdd6", }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); return is_compound ? kCompoundDigest[id] : kDigest[id]; } -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +template <bool is_compound> +const char* GetDigest12bpp(int id) { + static const char* const kDigest[] = { + "cd5d5e2102b8917ad70778f523d24bdf", "374a5f1b53a3fdf2eefa741eb71e6889", + "311636841770ec2427084891df96bee5", "c40c537917b1f0d1d84c99dfcecd8219", + "a1d9bb920e6c3d20c0cf84adc18e1f15", "13b5659acdb39b717526cb358c6f4026", + "f81ea4f6fd1f4ebed1262e3fae37b5bb", "c1452fefcd9b9562fe3a0b7f9302809c", + "8fed8a3159dc7b6b59a39ab2be6bee13", "b46458bc0e5cf1cee92aac4f0f608749", + "2e6a1039ab111add89f5b44b13565f40", "9c666691860bdc89b03f601b40126196", + "418a47157d992b94c302ca2e2f6ee07e", + }; + static const char* const kCompoundDigest[] = { + "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50", + "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38", + "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd", + "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27", + "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9", + "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047", + "42eb66e752e9ef289b47053b5c73fdd6", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return is_compound ? 
kCompoundDigest[id] : kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 int RandomWarpedParam(int seed_offset, int bits) { libvpx_test::ACMRandom rnd(seed_offset + @@ -228,6 +259,7 @@ struct WarpTestParam { template <bool is_compound, int bitdepth, typename Pixel> class WarpTest : public testing::TestWithParam<WarpTestParam> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); WarpTest() = default; ~WarpTest() override = default; @@ -389,14 +421,23 @@ void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values, id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3); } - const char* expected_digest; - if (bitdepth == 8) { - expected_digest = GetDigest8bpp<is_compound>(id); - } else { + const char* expected_digest = nullptr; + switch (bitdepth) { + case 8: + expected_digest = GetDigest8bpp<is_compound>(id); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - expected_digest = GetDigest10bpp<is_compound>(id); + case 10: + expected_digest = GetDigest10bpp<is_compound>(id); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetDigest12bpp<is_compound>(id); + break; #endif } + ASSERT_NE(expected_digest, nullptr); test_utils::CheckMd5Digest( "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), expected_digest, dest_, sizeof(dest_), elapsed_time); @@ -643,7 +684,22 @@ INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param)); INSTANTIATE_TEST_SUITE_P(NEON, WarpTest10bpp, testing::ValuesIn(warp_test_param)); #endif -#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +using WarpTest12bpp = WarpTest</*is_compound=*/false, 12, uint16_t>; +// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via +// WarpCompoundTest. 
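The tests in this patch converge on one dispatch shape for digest lookup: a nullptr-initialized pointer, a bitdepth switch with each case behind its preprocessor guard, and ASSERT_NE afterwards so an unhandled bitdepth fails loudly instead of hashing against the wrong table. A condensed sketch with placeholder digests and a hypothetical helper name:

#include <cassert>
#include <cstddef>

const char* GetDigestForBitdepth(int bitdepth, int id) {
  static const char* const k8bpp[] = {"digest-a", "digest-b"};  // placeholders
  const char* expected = nullptr;
  switch (bitdepth) {
    case 8:
      // Bounds checks mirror the asserts added to the GetDigest*bpp()
      // helpers in this file.
      assert(id >= 0);
      assert(static_cast<size_t>(id) < sizeof(k8bpp) / sizeof(k8bpp[0]));
      expected = k8bpp[id];
      break;
      // case 10: and case 12: take the same shape, each wrapped in its
      // LIBGAV1_MAX_BITDEPTH guard in the real tests.
  }
  return expected;  // callers then ASSERT_NE(expected, nullptr)
}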
+// using WarpCompoundTest12bpp = WarpTest</*is_compound=*/true, 12, uint16_t>; + +TEST_P(WarpTest12bpp, FixedValues) { TestFixedValues(); } + +TEST_P(WarpTest12bpp, RandomValues) { TestRandomValues(); } + +TEST_P(WarpTest12bpp, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, WarpTest12bpp, testing::ValuesIn(warp_test_param)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) { return os << "BlockSize" << warp_param.width << "x" << warp_param.height; diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc index 41f4c70..ee3808b 100644 --- a/src/dsp/weight_mask.cc +++ b/src/dsp/weight_mask.cc @@ -213,7 +213,86 @@ void Init10bpp() { #endif #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 +void Init12bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(12); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_WEIGHT_MASK(8, 8, 12, 0, 0); + INIT_WEIGHT_MASK(8, 16, 12, 0, 1); + INIT_WEIGHT_MASK(8, 32, 12, 0, 2); + INIT_WEIGHT_MASK(16, 8, 12, 1, 0); + INIT_WEIGHT_MASK(16, 16, 12, 1, 1); + INIT_WEIGHT_MASK(16, 32, 12, 1, 2); + INIT_WEIGHT_MASK(16, 64, 12, 1, 3); + INIT_WEIGHT_MASK(32, 8, 12, 2, 0); + INIT_WEIGHT_MASK(32, 16, 12, 2, 1); + INIT_WEIGHT_MASK(32, 32, 12, 2, 2); + INIT_WEIGHT_MASK(32, 64, 12, 2, 3); + INIT_WEIGHT_MASK(64, 16, 12, 3, 1); + INIT_WEIGHT_MASK(64, 32, 12, 3, 2); + INIT_WEIGHT_MASK(64, 64, 12, 3, 3); + INIT_WEIGHT_MASK(64, 128, 12, 3, 4); + INIT_WEIGHT_MASK(128, 64, 12, 4, 3); + INIT_WEIGHT_MASK(128, 128, 12, 4, 4); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8 + INIT_WEIGHT_MASK(8, 8, 12, 0, 0); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16 + INIT_WEIGHT_MASK(8, 16, 12, 0, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32 + INIT_WEIGHT_MASK(8, 32, 12, 0, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8 + INIT_WEIGHT_MASK(16, 8, 12, 1, 0); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16 + INIT_WEIGHT_MASK(16, 16, 12, 1, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32 + INIT_WEIGHT_MASK(16, 32, 12, 1, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64 + INIT_WEIGHT_MASK(16, 64, 12, 1, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8 + INIT_WEIGHT_MASK(32, 8, 12, 2, 0); #endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16 + INIT_WEIGHT_MASK(32, 16, 12, 2, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32 + INIT_WEIGHT_MASK(32, 32, 12, 2, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64 + INIT_WEIGHT_MASK(32, 64, 12, 2, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16 + INIT_WEIGHT_MASK(64, 16, 12, 3, 1); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32 + INIT_WEIGHT_MASK(64, 32, 12, 3, 2); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64 + INIT_WEIGHT_MASK(64, 64, 12, 3, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128 + INIT_WEIGHT_MASK(64, 128, 12, 3, 4); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64 + INIT_WEIGHT_MASK(128, 64, 12, 4, 3); +#endif +#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128 + INIT_WEIGHT_MASK(128, 128, 12, 4, 4); +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 } // namespace @@ -222,6 +301,9 @@ void WeightMaskInit_C() { #if LIBGAV1_MAX_BITDEPTH >= 10 Init10bpp(); #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + Init12bpp(); +#endif } } // namespace dsp diff --git a/src/dsp/weight_mask_test.cc 
b/src/dsp/weight_mask_test.cc index 77b608e..74ec03c 100644 --- a/src/dsp/weight_mask_test.cc +++ b/src/dsp/weight_mask_test.cc @@ -54,42 +54,42 @@ constexpr int kCompoundPredictionRange[3][2] = { const char* GetDigest8bpp(int id) { static const char* const kDigest[] = { - "035267cb2ac5a0f8ff50c2d30ad52226", - "3231f4972dd858b734e0cc48c4cd001e", - "7e163b69721a13ec9f75b5cd74ffee3f", + "eaca5b6a96dcfe5e44f3926a071b48b3", + "1d82c75cfdf8e57925eb1d5301647538", + "25bd455d74fb891b97b133c528f8db60", "" /*kBlock4x16*/, - "b75e90abc224acca8754c82039b3ba93", - "9f555f3a2c1a933a663d6103b8118dea", - "8539e54f34cd6668ff6e6606210be201", - "20f85c9db7c878c21fbf2052936f269e", - "620ec166de57b0639260b2d72eebfc3e", - "be666394b5a894d78f4097b6cca272fe", - "57a96816e84cdb381f596c23827b5922", - "f2e0d348f608f246b6d8d799b66c189e", - "161ac051f38372d9339d36728b9926ba", - "d5fad48aaf132a81cb62bba4f07bbebb", - "e10be2dca2f7dae38dae75150fc1612d", - "7f744481eb551bbc224b5236c82cbade", + "1d82c75cfdf8e57925eb1d5301647538", + "25bd455d74fb891b97b133c528f8db60", + "62a08776db35a186406a11ab92dee71c", + "95131d1dc0e05fcf4bd234d5ce9eea11", + "25bd455d74fb891b97b133c528f8db60", + "62a08776db35a186406a11ab92dee71c", + "95131d1dc0e05fcf4bd234d5ce9eea11", + "0b3c75272e0fb0747b9850145d340c4c", + "95131d1dc0e05fcf4bd234d5ce9eea11", + "0b3c75272e0fb0747b9850145d340c4c", + "f26c43d4bc823a89c1ed47ab8708bc06", + "0d99bbf31ecddc1c2d5063a68c0e9375", "0d99bbf31ecddc1c2d5063a68c0e9375", "5fb8ec5f582f0ebfe519ed55860f67c4", // mask_is_inverse = true. - "a4250ca39daa700836138371d36d465f", - "abe9a9a1c3a5accda9bfefd4d6e81ccb", - "e95b08878d0bb5f2293c27c3a6fe0253", + "96811f3b192828ff679e4c9ad8069d7d", + "a04dc180c028d55af70240163445523a", + "8513e3988233d0a7de316a0179bb6139", "" /*kBlock4x16*/, - "e1c52be02ce9ab2800015bb08b866c31", - "eea1dc73811f73866edfeb4555865f20", - "3178e64085645bd819256a8ab43c7b0a", - "ee83884e4d5cd2c9ac04879116bab681", - "d107eff7d5ae9ba14d2c6b3b8d9fca49", - "400aeea7d299626fc336c46b1ad7a9d8", - "e9e26a400f67f3ad36350fe4171fc613", - "4c31ad714f470f34127febaf1bac714b", - "bbdcb1097c66d561dd4ea16b3fb73f97", - "3a21dfbf53e4c964e303a75a3308ce15", - "3416dab4512fd0dc61d788b433cd624e", - "68ace8f01fdd74aec3fee528c8167738", + "a04dc180c028d55af70240163445523a", + "8513e3988233d0a7de316a0179bb6139", + "f7356d42fb44a6ccb41253ba35b8b3c7", + "3d2d61ffc203ee64fe91c9d16168a19d", + "8513e3988233d0a7de316a0179bb6139", + "f7356d42fb44a6ccb41253ba35b8b3c7", + "3d2d61ffc203ee64fe91c9d16168a19d", + "87a2011ac69fb597ca4f71bb3c35ebb0", + "3d2d61ffc203ee64fe91c9d16168a19d", + "87a2011ac69fb597ca4f71bb3c35ebb0", + "97100a3639d567046dc8a99fcb84cb2e", + "9fabe05a6523da81a45150e19f75acff", "9fabe05a6523da81a45150e19f75acff", "7c0643e4d02421d06d7ca71822a94e1d", }; @@ -99,42 +99,42 @@ const char* GetDigest8bpp(int id) { #if LIBGAV1_MAX_BITDEPTH >= 10 const char* GetDigest10bpp(int id) { static const char* const kDigest[] = { - "1dc9bdd042e5228705b857b42798e364", - "c054c8644bd482ce78a139d8e063e013", - "bbe4ac48f013f34c84779da05b0bcbe0", + "5ae8d64b65a671301a457b8a73368ab5", + "61535217f179054d4b76a8d9352a223d", + "1aa6614773570e7b021cd509849c4180", "" /*kBlock4x16*/, - "13d4759277637a607f25439182553708", - "f089667610561a47d50f9f930ad7c454", - "46715e6f7819f59725bdb083f4403255", - "3774541c339ae3af920ef2b1d6abf6a1", - "94913b01d226cb5eb273dfee84b51f65", - "be0c0847629dfff8e0e991ed67697a7d", - "716b5398b77d7459274d4ea9c91ebd8e", - "f5c1b0b461df4182529949472242b421", - "5e9576ea4cf107249ce4ae89a72b9c95", - "da021bcdf7936f7bd9a2399c69e4d37c", 
- "b3a310a39c1900e00f992839ff188656", - "9f3a15351af5945615f296242ec56a38", + "61535217f179054d4b76a8d9352a223d", + "1aa6614773570e7b021cd509849c4180", + "f04c2825cfb6408c7778658f71fa176e", + "e1694ea1f026dac7fe7e86a84482cf86", + "1aa6614773570e7b021cd509849c4180", + "f04c2825cfb6408c7778658f71fa176e", + "e1694ea1f026dac7fe7e86a84482cf86", + "9c4855d44c013fbddb373b2e9e311080", + "e1694ea1f026dac7fe7e86a84482cf86", + "9c4855d44c013fbddb373b2e9e311080", + "f510e743c3efe3b83374a98ef8a30838", + "b6e0bd03c521c5f00e90530daa7d4432", "b6e0bd03c521c5f00e90530daa7d4432", "3270d7f621d488aec5b76bcf121debd0", // mask_is_inverse = true. - "33df96dd246683133eefe4caea6e3f7d", - "73e0ccc5d42806548a4b59f856256c1e", - "3561a0358cf831aee9477d07feafae2d", + "9aa00fcfe21b71e30c5393699122a020", + "4d8ce33262cf6b5375f363530815189a", + "428625c51ac1bd4585988f7b36dff1db", "" /*kBlock4x16*/, - "c5a2e633c0cd6925e68f21f47f0e2d84", - "8755a2d3840dde5fd6a0cce6bd6642c5", - "85ec538b72cecd6ea1fddab5ce3b4e64", - "a53e0dec84c675c4c6b1f5792b0232ff", - "86180da325f9727670a98cf2dbf7410e", - "a5fdc95104948047e179b2bc3d47f51d", - "9b95b3858187838e4669180e2ddb295e", - "6e40ca55608f6bf2f8cd91c8dbf3ddbf", - "d3a092672e921b588279d57e50b31888", - "9883eb19b733ee9f1cb6a6b6a1a00bb5", - "dd34764e068b228b7820321b06864e63", - "6c743dc9c8c87c7044151d29993e5042", + "4d8ce33262cf6b5375f363530815189a", + "428625c51ac1bd4585988f7b36dff1db", + "1ef63c06a2d9c42da293fdf924032981", + "5dd3f201d755d1c22c126a633bfbb3c0", + "428625c51ac1bd4585988f7b36dff1db", + "1ef63c06a2d9c42da293fdf924032981", + "5dd3f201d755d1c22c126a633bfbb3c0", + "fe1e6843e6f214939da516dcbea04a79", + "5dd3f201d755d1c22c126a633bfbb3c0", + "fe1e6843e6f214939da516dcbea04a79", + "240187f27389b5e89f9ec6bdbd7d20a7", + "44925dab01011a98b8ab1f0308fa852a", "44925dab01011a98b8ab1f0308fa852a", "6d984b2ccfa056278e2130771127a943", }; @@ -142,6 +142,52 @@ const char* GetDigest10bpp(int id) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigest12bpp(int id) { + static const char* const kDigest[] = { + "57629d3872fd52ff4bbec439c5517ec5", + "dba421ceeb534756c77167e00ae91a2c", + "72e8ac1d450ef0c6c6b03e93856d5cc2", + "" /*kBlock4x16*/, + "dba421ceeb534756c77167e00ae91a2c", + "72e8ac1d450ef0c6c6b03e93856d5cc2", + "ae573eb368df04e6a0133b4e15471728", + "ceede597b2729357b15e0d08bb9bb760", + "72e8ac1d450ef0c6c6b03e93856d5cc2", + "ae573eb368df04e6a0133b4e15471728", + "ceede597b2729357b15e0d08bb9bb760", + "c4976af803d7ad3f92ef26f25b9f3754", + "ceede597b2729357b15e0d08bb9bb760", + "c4976af803d7ad3f92ef26f25b9f3754", + "1d957d49f71bb7f304705a11a597f0cb", + "9522d5713fb951b79f42d78fbff914cf", + "9522d5713fb951b79f42d78fbff914cf", + "422c046013f79a9f46e2c855967570ba", + + // mask_is_inverse = true. 
+ "a585cca9bc459d10e081bc0eb847b6e3", + "2fa4ec5f74fad2831d216c51c2cdad5a", + "d6c9ac69a9eb3059f5bb6e42b486ebcd", + "" /*kBlock4x16*/, + "2fa4ec5f74fad2831d216c51c2cdad5a", + "d6c9ac69a9eb3059f5bb6e42b486ebcd", + "2ddd8c8a1841501964011030e2557e20", + "97ef2575023dda008711015cf08d7590", + "d6c9ac69a9eb3059f5bb6e42b486ebcd", + "2ddd8c8a1841501964011030e2557e20", + "97ef2575023dda008711015cf08d7590", + "d69aff1e0d43395ce305c9be0dfb4c89", + "97ef2575023dda008711015cf08d7590", + "d69aff1e0d43395ce305c9be0dfb4c89", + "48786f640191dcbee5b3321672778519", + "6ad4718230353440b01f2bb78348157e", + "6ad4718230353440b01f2bb78348157e", + "ad49bd7af0ea17c84f434c7dfd0a911d", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct WeightMaskTestParam { WeightMaskTestParam(int width, int height, bool mask_is_inverse) : width(width), height(height), mask_is_inverse(mask_is_inverse) {} @@ -159,6 +205,7 @@ template <int bitdepth> class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); WeightMaskTest() = default; ~WeightMaskTest() override = default; @@ -276,7 +323,7 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs, SetInputData(use_fixed_values, value_1, value_2); const absl::Time start = absl::Now(); for (int i = 0; i < num_runs; ++i) { - func_(block_1_, block_2_, mask_, kMaxPredictionSize); + func_(block_1_, block_2_, mask_, width_); } const absl::Duration elapsed_time = absl::Now() - start; if (use_fixed_values) { @@ -284,8 +331,7 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs, if (mask_is_inverse_) fixed_value = 64 - fixed_value; for (int y = 0; y < height_; ++y) { for (int x = 0; x < width_; ++x) { - ASSERT_EQ(static_cast<int>(mask_[y * kMaxPredictionSize + x]), - fixed_value) + ASSERT_EQ(static_cast<int>(mask_[y * width_ + x]), fixed_value) << "x: " << x << " y: " << y; } } @@ -293,17 +339,26 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs, const int id_offset = mask_is_inverse_ ? 
kMaxBlockSizes - 4 : 0; const int id = id_offset + static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4; - if (bitdepth == 8) { - test_utils::CheckMd5Digest( - absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), - "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time); + const char* expected_digest = nullptr; + switch (bitdepth) { + case 8: + expected_digest = GetDigest8bpp(id); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - test_utils::CheckMd5Digest( - absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), - "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time); + case 10: + expected_digest = GetDigest10bpp(id); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetDigest12bpp(id); + break; #endif } + ASSERT_NE(expected_digest, nullptr); + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", expected_digest, mask_, sizeof(mask_), elapsed_time); } } @@ -385,6 +440,28 @@ INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using WeightMaskTest12bpp = WeightMaskTest<12>; + +TEST_P(WeightMaskTest12bpp, FixedValues) { + const int min = kCompoundPredictionRange[2][0]; + const int max = kCompoundPredictionRange[2][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest12bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest12bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest12bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace dsp } // namespace libgav1 diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc index 911c5a9..c08b3d6 100644 --- a/src/dsp/x86/average_blend_sse4.cc +++ b/src/dsp/x86/average_blend_sse4.cc @@ -35,24 +35,46 @@ namespace { constexpr int kInterPostRoundBit = 4; -inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT dest) { - const __m128i pred_0 = LoadLo8(prediction_0); - const __m128i pred_1 = LoadLo8(prediction_1); - __m128i res = _mm_add_epi16(pred_0, pred_1); - res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); - Store4(dest, _mm_packus_epi16(res, res)); +inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + __m128i res_0 = _mm_add_epi16(pred_00, pred_10); + res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1); + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + __m128i res_1 = _mm_add_epi16(pred_01, pred_11); + res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1); + const __m128i result_pixels = _mm_packus_epi16(res_0, res_1); + Store4(dest, result_pixels); + dest += dest_stride; + const int result_1 = _mm_extract_epi32(result_pixels, 1); + memcpy(dest, &result_1, sizeof(result_1)); + dest += dest_stride; + const int result_2 = _mm_extract_epi32(result_pixels, 2); + memcpy(dest, &result_2, sizeof(result_2)); + dest += dest_stride; + const int result_3 = 
_mm_extract_epi32(result_pixels, 3); + memcpy(dest, &result_3, sizeof(result_3)); } inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0, const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT dest) { - const __m128i pred_0 = LoadAligned16(prediction_0); - const __m128i pred_1 = LoadAligned16(prediction_1); - __m128i res = _mm_add_epi16(pred_0, pred_1); - res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); - StoreLo8(dest, _mm_packus_epi16(res, res)); + uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + __m128i res_0 = _mm_add_epi16(pred_00, pred_10); + res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1); + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + __m128i res_1 = _mm_add_epi16(pred_01, pred_11); + res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1); + const __m128i result_pixels = _mm_packus_epi16(res_0, res_1); + StoreLo8(dest, result_pixels); + StoreHi8(dest + dest_stride, result_pixels); } inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0, @@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, int y = height; if (width == 4) { + const ptrdiff_t dest_stride4 = dest_stride << 2; + constexpr ptrdiff_t width4 = 4 << 2; do { - // TODO(b/150326556): |prediction_[01]| values are packed. It is possible - // to load 8 values at a time. - AverageBlend4Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; - - AverageBlend4Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; + AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride); + dst += dest_stride4; + pred_0 += width4; + pred_1 += width4; - y -= 2; + y -= 4; } while (y != 0); return; } if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + constexpr ptrdiff_t width2 = 8 << 1; do { - AverageBlend8Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; - - AverageBlend8Row(pred_0, pred_1, dst); - dst += dest_stride; - pred_0 += width; - pred_1 += width; + AverageBlend8Row(pred_0, pred_1, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; y -= 2; } while (y != 0); diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc index 4ea811a..3288cfc 100644 --- a/src/dsp/x86/common_sse4_test.cc +++ b/src/dsp/x86/common_sse4_test.cc @@ -31,7 +31,7 @@ namespace { // INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then // RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for // negative values. 
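For reference while reading this test: the scalar operation being validated is the usual add-bias-then-shift rounding, and the 16-bit SIMD variant can saturate while adding the bias, which is the corner case the comment above describes. A sketch of the scalar form, assuming the standard definition rather than quoting the library's helper:

#include <cstdint>

inline int32_t RightShiftWithRoundingRef(int32_t value, int bits) {
  const int32_t bias = (1 << bits) >> 1;  // 0 when bits == 0
  return (value + bias) >> bits;
}
// In 16-bit lanes the addition saturates once value > INT16_MAX - bias, so
// the SIMD result diverges from this reference near the top of the range.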
-TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) { +TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) { for (int bits = 0; bits < 16; ++bits) { const int bias = (1 << bits) >> 1; for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) { @@ -56,7 +56,7 @@ TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) { #else // !LIBGAV1_TARGETING_SSE4_1 -TEST(CommonDspTest, SSE4) { +TEST(CommonDspTest, SSE41) { GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable " "the tests."; } diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc index 4126ca9..6e94347 100644 --- a/src/dsp/x86/convolve_avx2.cc +++ b/src/dsp/x86/convolve_avx2.cc @@ -39,17 +39,17 @@ namespace { // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final // sum from outranging int16_t. -template <int filter_index> +template <int num_taps> __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { __m256i sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1 const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3 const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5 sum = _mm256_add_epi16(v_madd_21, v_madd_43); sum = _mm256_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0 const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2 @@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32); const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76); sum = _mm256_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3 } else { @@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { return sum; } -template <int filter_index> +template <int num_taps> __m256i SumHorizontalTaps(const __m256i* const src, const __m256i* const v_tap) { __m256i v_src[4]; @@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src, const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long); const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long); - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 - } else if (filter_index > 3) { + } else { // 4 taps. 
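Since this refactor re-keys the convolve templates from the AV1 filter_index to an explicit tap count, the mapping implied by the rewritten branches is worth spelling out. A sketch with a hypothetical helper name; the separate GetNumTapsInFilter(filter_index, filter_id) change elsewhere in this diff refines tap selection further and is not captured here:

// filter_index 0, 1 -> 6 taps   (old tests: filter_index < 2)
// filter_index 2    -> 8 taps
// filter_index 3    -> 2 taps
// filter_index 4, 5 -> 4 taps   (hence the (filter_index & 0x4) != 0 tests
//                                later in this diff)
constexpr int TapsForFilterIndex(int filter_index) {
  return filter_index == 2 ? 8
         : filter_index == 3
               ? 2
               : filter_index < 2 ? 6 : 4;
}
static_assert(TapsForFilterIndex(0) == 6, "");
static_assert(TapsForFilterIndex(5) == 4, "");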
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 } - return SumOnePassTaps<filter_index>(v_src, v_tap); + return SumOnePassTaps<num_taps>(v_src, v_tap); } -template <int filter_index> +template <int num_taps> __m256i SimpleHorizontalTaps(const __m256i* const src, const __m256i* const v_tap) { - __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); + __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src, return _mm256_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m256i HorizontalTaps8To16(const __m256i* const src, const __m256i* const v_tap) { - const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); + const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } // Filter 2xh sizes. -template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, do { if (is_2d) { const __m128i sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap); Store4(&dest16[0], sum); dest16 += pred_stride; Store4(&dest16[0], _mm_srli_si128(sum, 8)); dest16 += pred_stride; } else { const __m128i sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); Store2(dest8, sum); dest8 += pred_stride; Store2(dest8, _mm_srli_si128(sum, 4)); @@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); __m128i sum; const __m128i input = LoadLo8(&src[2]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 .... const __m128i v_src_43 = _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); @@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, } // Filter widths >= 4. -template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m256i src_long = SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8])); const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + HorizontalTaps8To16<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]), LoadUnaligned16(&src[x + 24])); const __m256i result2 = - HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + HorizontalTaps8To16<num_taps>(&src_long2, v_tap); if (is_2d) { StoreAligned32(&dest16[x], result); StoreAligned32(&dest16[x + 16], result2); @@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load src used to calculate dest8[7:0] and dest8[23:16]. 
const __m256i src_long = LoadUnaligned32(&src[x]); const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long, v_tap); // Load src used to calculate dest8[15:8] and dest8[31:24]. const __m256i src_long2 = LoadUnaligned32(&src[x + 8]); const __m256i result2 = - SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long2, v_tap); // Combine results and store. StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2)); } @@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[src_stride]), LoadUnaligned16(&src[8 + src_stride])); const __m256i result2 = - HorizontalTaps8To16<filter_index>(&src_long2, v_tap); + HorizontalTaps8To16<num_taps>(&src_long2, v_tap); if (is_2d) { StoreAligned32(&dest16[0], result); StoreAligned32(&dest16[pred_stride], result2); @@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[src_stride])); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); const __m256i src_long2 = SetrM128i( LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride])); const __m256i result2 = - SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + SimpleHorizontalTaps<num_taps>(&src_long2, v_tap); const __m256i packed_result = _mm256_unpacklo_epi64(result, result2); StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result)); StoreUnaligned16(&dest8[pred_stride], @@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, if (is_2d) { const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreAligned32(&dest16[0], result); } @@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); if (is_2d) { StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); StoreAligned16(&dest16[pred_stride], @@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(this_row, next_row); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); StoreLo8(&dest8[0], _mm256_castsi256_si128(result)); StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); } @@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // filter the remaining row. 
if (is_2d) { const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreAligned16(&dest16[0], _mm256_castsi256_si128(result)); } @@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); const __m256i src_long = SetrM128i(this_row, next_row); if (is_2d || is_compound) { - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1)); } else { @@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const __m128i next_row = LoadUnaligned16(&src[src_stride]); // Load into 2 128 bit lanes. const __m256i src_long = SetrM128i(this_row, next_row); - const __m256i result = - SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap); Store4(&dest8[0], _mm256_castsi256_si128(result)); Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); } @@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, // filter the remaining row. if (is_2d) { const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0])); - const __m256i result = - HorizontalTaps8To16<filter_index>(&src_long, v_tap); + const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap); StoreLo8(&dest16[0], _mm256_castsi256_si128(result)); } } @@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH( const __m128i v_horizontal_filter = LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); - if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. 
SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(32) uint16_t @@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) { return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index, bool unpack_high = false> +template <int num_taps, bool unpack_high = false> __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) { __m256i v_src[4]; if (!unpack_high) { - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]); v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]); } } else { - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]); v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. 
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]); v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]); } } - return SumOnePassTaps<filter_index>(v_src, v_tap); + return SumOnePassTaps<num_taps>(v_src, v_tap); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadUnaligned32(src_x); src_x += src_stride; - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m256i sums_hi = - SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap); if (is_compound) { const __m256i results = Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); @@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src, } while (x < width); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row - 1] = _mm256_inserti128_si256( srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m256i sums_hi = - SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap); + SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap); if (is_compound) { const __m256i results = Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20)); @@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m256i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row - 1] = _mm256_inserti128_si256( srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1); - const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m256i results = Compound1DShift(sums); const __m128i this_dst = _mm256_castsi256_si128(results); @@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* 
LIBGAV1_RESTRICT src, } while (y != 0); } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int /*width*/, const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadLo8(src_x); src_x += src_stride; - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, // Use 256 bits for width > 4. if (width > 4) { __m256i taps_256[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. 
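The pointer adjustment visible above, src = reference - (vertical_taps / 2 - 1) * src_stride, centers the tap window on the destination row: an 8-tap filter reads source rows y - 3 through y + 4 to produce output row y. A scalar sketch of that vertical pass, assuming full 7-bit taps that sum to 128 rather than the library's halved taps (VerticalFilterColumn is an illustrative name):

#include <cstddef>
#include <cstdint>

// Scalar sketch of the vertical pass: the (num_taps / 2 - 1) row
// adjustment before the loop mirrors the source-pointer setup in
// ConvolveVertical_AVX2 above.
void VerticalFilterColumn(const uint8_t* src, ptrdiff_t stride, uint8_t* dst,
                          ptrdiff_t dst_stride, const int16_t* taps,
                          int num_taps, int height) {
  src -= (num_taps / 2 - 1) * stride;  // first tap's source row
  for (int y = 0; y < height; ++y) {
    int sum = 0;
    for (int k = 0; k < num_taps; ++k) sum += taps[k] * src[k * stride];
    const int pixel = (sum + 64) >> 7;  // kFilterBits = 7, with rounding
    dst[0] = static_cast<uint8_t>(pixel < 0 ? 0 : pixel > 255 ? 255 : pixel);
    src += stride;
    dst += dst_stride;
  }
}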
SetupTaps<2>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap. SetupTaps<4>(&v_filter, taps_256); if (width == 8) { FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height, @@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference, FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height, taps_256); } - } else { - SetupTaps<4>(&v_filter, taps_256); - if (width == 8) { - FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } else if (width == 16) { - FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } else { - FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height, - taps_256); - } } } else { // width <= 8 // Use 128 bit code. __m128i taps[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, - taps); - } else { - FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, - taps); - } - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_filter, taps); - if (width == 2) { - FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } - } else { + } else { // 4 tap. 
SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, - taps); + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); } } } @@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2( // Use 256 bits for width > 4. if (width > 4) { __m256i taps_256[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<0, /*is_compound=*/true>( + FilterVertical8xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<0, /*is_compound=*/true>( + FilterVertical16xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<0, /*is_compound=*/true>( + FilterVertical32xH<6, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<2, /*is_compound=*/true>( + FilterVertical8xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<2, /*is_compound=*/true>( + FilterVertical16xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<2, /*is_compound=*/true>( + FilterVertical32xH<8, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps_256); if (width == 8) { - FilterVertical8xH<3, /*is_compound=*/true>( + FilterVertical8xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else if (width == 16) { - FilterVertical16xH<3, /*is_compound=*/true>( + FilterVertical16xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } else { - FilterVertical32xH<3, /*is_compound=*/true>( + FilterVertical32xH<2, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap. 
SetupTaps<4>(&v_filter, taps_256); if (width == 8) { FilterVertical8xH<4, /*is_compound=*/true>( @@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2( FilterVertical32xH<4, /*is_compound=*/true>( src, src_stride, dest, dest_stride, width, height, taps_256); } - } else { - SetupTaps<4>(&v_filter, taps_256); - if (width == 8) { - FilterVertical8xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } else if (width == 16) { - FilterVertical16xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } else { - FilterVertical32xH<5, /*is_compound=*/true>( - src, src_stride, dest, dest_stride, width, height, taps_256); - } } } else { // width <= 4 // Use 128 bit code. __m128i taps[4]; - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); - FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 2) { // 8 tap. + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); - FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 3) { // 2 tap. + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); - FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_filter, taps); - FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); - } else { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else { // 4 tap. SetupTaps<4>(&v_filter, taps); - FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, - dest_stride, height, taps); + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); } } } @@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2( void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. alignas(32) uint16_t diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc index f7e5a71..f427c4c 100644 --- a/src/dsp/x86/convolve_sse4.cc +++ b/src/dsp/x86/convolve_sse4.cc @@ -36,7 +36,7 @@ namespace { #include "src/dsp/x86/convolve_sse4.inc" -template <int filter_index> +template <int num_taps> __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { __m128i v_src[4]; @@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long); const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long); - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. 
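The byte-duplicating loads that follow exist to feed _mm_maddubs_epi16, which multiplies unsigned bytes from its first operand by signed bytes from its second and sums adjacent pairs into 16-bit lanes, computing two filter taps per lane in one instruction. A standalone illustration with example values (build with SSSE3 or later, e.g. -msse4.1):

#include <smmintrin.h>
#include <cstdio>

// Each 16-bit result lane is u0 * s0 + u1 * s1 of the corresponding byte
// pair, which is why the horizontal code interleaves neighboring pixels.
int main() {
  const __m128i pixels =
      _mm_setr_epi8(10, 20, 10, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i taps =
      _mm_setr_epi8(-2, 64, -2, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i sums = _mm_maddubs_epi16(pixels, taps);
  printf("%d\n", _mm_extract_epi16(sums, 0));  // 10 * -2 + 20 * 64 = 1260
  return 0;
}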
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); return sum; } -template <int filter_index> +template <int num_taps> __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src, return _mm_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src, const __m128i* const v_tap) { - const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap); + const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int num_taps, int filter_index, bool is_2d = false, - bool is_compound = false> +template <int num_taps, bool is_2d = false, bool is_compound = false> void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dest, @@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, int x = 0; do { if (is_2d || is_compound) { - const __m128i v_sum = - HorizontalTaps8To16<filter_index>(&src[x], v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap); if (is_2d) { StoreAligned16(&dest16[x], v_sum); } else { StoreUnaligned16(&dest16[x], v_sum); } } else { - const __m128i result = - SimpleHorizontalTaps<filter_index>(&src[x], v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap); StoreLo8(&dest8[x], result); } x += 8; @@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, int y = height; do { if (is_2d || is_compound) { - const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap); + const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap); StoreLo8(dest16, v_sum); } else { - const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap); + const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap); Store4(&dest8[0], result); } src += src_stride; @@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, do { if (is_2d) { const __m128i sum = - HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap); 
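The "downshift in two passes" comment quoted above can be checked numerically. Assuming libgav1's 8bpp constants kInterRoundBitsHorizontal = 3 and kFilterBits = 7, each reduced by 1 for the halved taps, pre-adding the first stage's rounding bit makes a single rounded shift exactly equivalent to the two-stage sequence:

#include <cstdio>

int main() {
  for (int sum = -2000; sum <= 2000; ++sum) {
    // Two rounded shifts: by kInterRoundBitsHorizontal - 1 = 2, then by
    // (kFilterBits - 1) - (kInterRoundBitsHorizontal - 1) = 4.
    const int two_pass = (((sum + 2) >> 2) + 8) >> 4;
    // One shift by kFilterBits - 1 = 6 after pre-adding the first stage's
    // rounding bit, 1 << (kInterRoundBitsHorizontal - 2) = 2.
    const int one_pass = (sum + 2 + 32) >> 6;
    if (two_pass != one_pass) printf("mismatch at %d\n", sum);
  }
  printf("checked\n");  // no mismatches are printed
  return 0;
}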
Store4(&dest16[0], sum); dest16 += pred_stride; Store4(&dest16[0], _mm_srli_si128(sum, 8)); dest16 += pred_stride; } else { const __m128i sum = - SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); Store2(dest8, sum); dest8 += pred_stride; Store2(dest8, _mm_srli_si128(sum, 4)); @@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src, assert(height % 2 == 1); __m128i sum; const __m128i input = LoadLo8(&src[2]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 .... const __m128i v_src_43 = _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); @@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( if (filter_index == 2) { // 8 tap. SetupTaps<8>(&v_horizontal_filter, v_tap); - FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 1) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else if (filter_index == 0) { // 6 tap. SetupTaps<6>(&v_horizontal_filter, v_tap); - FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 4) { // 4 tap. - SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index == 5) { // 4 tap. + FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); + } else if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) SetupTaps<4>(&v_horizontal_filter, v_tap); - FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } else { // 2 tap. SetupTaps<2>(&v_horizontal_filter, v_tap); - FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride, - width, height, v_tap); + FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride, + width, height, v_tap); } } @@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t pred_stride) { const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); // The output of the horizontal filter is guaranteed to fit in 16 bits. 
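That closing remark describes the 2D path's contract: the horizontal pass writes 16-bit intermediates for height + vertical_taps - 1 rows, and the vertical pass reduces them back to 8-bit pixels. A scalar sketch of that two-pass shape, with AV1's 8bpp round bits (3 horizontal, 11 vertical) assumed and the library's unsigned-offset bookkeeping omitted (Convolve2DScalar is an illustrative name):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// The horizontal pass starts (v_num_taps / 2 - 1) rows above the block so
// the vertical pass has a centered window for every output row.
void Convolve2DScalar(const uint8_t* src, ptrdiff_t src_stride,
                      const int16_t* h_taps, int h_num_taps,
                      const int16_t* v_taps, int v_num_taps, int width,
                      int height, uint8_t* dst, ptrdiff_t dst_stride) {
  const int intermediate_height = height + v_num_taps - 1;
  std::vector<int16_t> intermediate(width * intermediate_height);
  const uint8_t* src_h =
      src - (v_num_taps / 2 - 1) * src_stride - (h_num_taps / 2 - 1);
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < h_num_taps; ++k) sum += h_taps[k] * src_h[x + k];
      intermediate[y * width + x] = static_cast<int16_t>((sum + 4) >> 3);
    }
    src_h += src_stride;
  }
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int k = 0; k < v_num_taps; ++k) {
        sum += v_taps[k] * intermediate[(y + k) * width + x];
      }
      const int pixel = (sum + (1 << 10)) >> 11;
      dst[y * dst_stride + x] =
          static_cast<uint8_t>(std::min(std::max(pixel, 0), 255));
    }
  }
}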
alignas(16) uint16_t @@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, } } -template <int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, const ptrdiff_t src_stride, void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride, const int width, const int height, const __m128i* const v_tap) { - const int num_taps = GetNumTapsInFilter(filter_index); const int next_row = num_taps - 1; auto* dst8 = static_cast<uint8_t*>(dst); auto* dst16 = static_cast<uint16_t*>(dst); @@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src, srcs[next_row] = LoadLo8(src_x); src_x += src_stride; - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16_x, results); @@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<6>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. SetupTaps<8>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<8>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. 
SetupTaps<2>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); } else { - FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap SetupTaps<4>(&v_filter, taps); if (width == 2) { - FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); } else if (width == 4) { - FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps); + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); } else { FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, taps); } - } else { - // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. - // See convolve_neon.cc - SetupTaps<4>(&v_filter, taps); - - if (width == 2) { - FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else if (width == 4) { - FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps); - } else { - FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, - taps); - } } } -void ConvolveCompoundCopy_SSE4( +void ConvolveCompoundCopy_SSE4_1( const void* LIBGAV1_RESTRICT const reference, const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, @@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4( _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical); const __m128i v_dest_hi = _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical); - // TODO(slavarnway): Investigate using aligned stores. StoreUnaligned16(&dest[x], v_dest_lo); StoreUnaligned16(&dest[x + 8], v_dest_hi); x += 16; @@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1( const int vertical_filter_id, const int width, const int height, void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { const int filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(filter_index); + const int vertical_taps = + GetNumTapsInFilter(filter_index, vertical_filter_id); const ptrdiff_t src_stride = reference_stride; const auto* src = static_cast<const uint8_t*>(reference) - (vertical_taps / 2 - 1) * src_stride; @@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1( const __m128i v_filter = LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); - if (filter_index < 2) { // 6 tap. + if (vertical_taps == 6) { // 6 tap. SetupTaps<6>(&v_filter, taps); if (width == 4) { - FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 2) { // 8 tap. + } else if (vertical_taps == 8) { // 8 tap. 
SetupTaps<8>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 3) { // 2 tap. + } else if (vertical_taps == 2) { // 2 tap. SetupTaps<2>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { - FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else if (filter_index == 4) { // 4 tap. + } else { // 4 tap SetupTaps<4>(&v_filter, taps); - if (width == 4) { - FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); } else { FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, width, height, taps); } - } else { - SetupTaps<4>(&v_filter, taps); - - if (width == 4) { - FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4, - height, taps); - } else { - FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, - width, height, taps); - } } } @@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1( // Similarly for height. const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); - const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + const int vertical_taps = + GetNumTapsInFilter(vert_filter_index, vertical_filter_id); const int intermediate_height = height + vertical_taps - 1; const ptrdiff_t src_stride = reference_stride; const auto* const src = static_cast<const uint8_t*>(reference) - @@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, source); StoreLo8(intermediate, RightShiftWithRounding_S16( - SumOnePassTaps<filter_index>(source, taps), + SumOnePassTaps<num_taps>(source, taps), kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate += kIntermediateStride; @@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src, PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source); // Shift by one less because the taps are halved. 
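"Shift by one less because the taps are halved" is exact arithmetic, not an approximation: SetupTaps stores the 7-bit coefficients divided by two, so every product already carries a factor of 1/2, and shifting right by kInterRoundBitsHorizontal - 1 instead of kInterRoundBitsHorizontal (assumed to be 3) restores the same scale. A one-line check with illustrative values:

#include <cassert>

int main() {
  const int pixel = 200, full_tap = 64, halved_tap = full_tap / 2;
  const int full = (pixel * full_tap + 4) >> 3;      // rounded shift by 3
  const int halved = (pixel * halved_tap + 2) >> 2;  // rounded shift by 3 - 1
  assert(full == halved);  // both are 1600
  return 0;
}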
- StoreAligned16( - intermediate_x, - RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps), - kInterRoundBitsHorizontal - 1)); + StoreAligned16(intermediate_x, RightShiftWithRounding_S16( + SumOnePassTaps<num_taps>(source, taps), + kInterRoundBitsHorizontal - 1)); src_x += src_stride; intermediate_x += kIntermediateStride; } while (--y != 0); @@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, alignas(16) int16_t intermediate_result[kIntermediateAllocWidth * (2 * kIntermediateAllocWidth + kSubPixelTaps)]; - const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index); const int intermediate_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> kScaleSubPixelBits) + @@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference, // inputs in each iteration on large blocks. When step_x is large, we need a // second register and alignr in order to gather all filter inputs. // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. - const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index); const int kernel_start_ceiling = 16 - num_horiz_taps; // This truncated quotient |grade_x_threshold| selects |step_x| such that: // (step_x * 7) >> kScaleSubPixelBits < single load limit @@ -1891,7 +1860,7 @@ void Init8bpp() { dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1; dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1; - dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4; + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1; dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1; dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1; dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1; diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc index 550d6a4..5548c5b 100644 --- a/src/dsp/x86/convolve_sse4.inc +++ b/src/dsp/x86/convolve_sse4.inc @@ -18,20 +18,63 @@ #include "src/dsp/convolve.inc" +// This version checks for the special cases when filter_index == 1. +int GetNumTapsInFilter(const int filter_index, const int filter_id) { + if (filter_index == 0) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + return 6; + } + + if (filter_index == 1) { + // Despite the names these only use 6 taps. + // kInterpolationFilterEightTap + // kInterpolationFilterEightTapSmooth + if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) | + (filter_id == 8) | (filter_id == 9)) != 0) { + return 6; + } + // When |filter_index| == 1, the |filter_id| values not listed above map to + // 4 tap filters. + return 4; + } + + if (filter_index == 2) { + // kInterpolationFilterEightTapSharp + return 8; + } + + if (filter_index == 3) { + // kInterpolationFilterBilinear + return 2; + } + + assert(filter_index > 3); + // For small sizes (width/height <= 4) the large filters are replaced with 4 + // tap options. + // If the original filters were |kInterpolationFilterEightTap| or + // |kInterpolationFilterEightTapSharp| then it becomes + // |kInterpolationFilterSwitchable|. + // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 + // tap filter. + return 4; +} + // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and // sum. The filters in |taps[]| are pre-shifted by 1. 
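The two-argument GetNumTapsInFilter above is the functional heart of this patch: filter_index 1 is no longer uniformly treated as a 6-tap filter, presumably because only the listed sub-pixel phases have nonzero outer coefficients. A condensed restatement with example queries, for illustration only (NumTaps is an illustrative name):

#include <cassert>

// Condensed model of the GetNumTapsInFilter shown above: filter_ids 1, 7,
// 8, 9 and 15 of filter_index 1 keep 6 taps; the rest drop to 4.
int NumTaps(int filter_index, int filter_id) {
  if (filter_index == 0) return 6;
  if (filter_index == 1) {
    return (filter_id == 1 || filter_id == 7 || filter_id == 8 ||
            filter_id == 9 || filter_id == 15)
               ? 6
               : 4;
  }
  if (filter_index == 2) return 8;
  if (filter_index == 3) return 2;
  return 4;  // filter_index 4 and 5.
}

int main() {
  // The point of the change: index 1 now dispatches per sub-pixel phase.
  assert(NumTaps(1, 8) == 6);
  assert(NumTaps(1, 4) == 4);
  return 0;
}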
This prevents the final // sum from outranging int16_t. -template <int filter_index> +template <int num_taps> __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { __m128i sum; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 sum = _mm_add_epi16(v_madd_21, v_madd_43); sum = _mm_add_epi16(sum, v_madd_65); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 @@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); sum = _mm_add_epi16(v_sum_7654, v_sum_3210); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 } else { @@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { return sum; } -template <int filter_index> +template <int num_taps> __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); - if (filter_index == 3) { + if (num_taps == 2) { // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 const __m128i v_src_43 = _mm_shuffle_epi8( v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); @@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return v_sum_5432; } -template <int filter_index> +template <int num_taps> __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { - __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); // Normally the Horizontal pass does the downshift in two passes: // kInterRoundBitsHorizontal - 1 and then (kFilterBits - @@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, return _mm_packus_epi16(sum, sum); } -template <int filter_index> +template <int num_taps> __m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, const __m128i* const v_tap) { - const __m128i sum = - SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } @@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) { return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); } -template <int filter_index> +template <int num_taps> __m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { __m128i v_src[4]; - if (filter_index < 2) { + if (num_taps == 6) { // 6 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); - } else if (filter_index == 2) { + } else if (num_taps == 8) { // 8 taps. 
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); - } else if (filter_index == 3) { + } else if (num_taps == 2) { // 2 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); - } else if (filter_index > 3) { + } else { // 4 taps. v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); } - const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap); + const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); return sum; } -// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the -// 2D version. -template <int num_taps, int filter_index, bool is_compound = false> +template <int num_taps, bool is_compound = false> void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 12 13 20 21 22 23 srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 30 31 32 33 40 41 42 43 srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 50 51 52 53 60 61 62 63 srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, // 70 71 72 73 80 81 82 83 srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); if (is_compound) { const __m128i results = Compound1DShift(sums); StoreUnaligned16(dst16, results); @@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, } } -template <int num_taps, int filter_index, bool negative_outside_taps = false> +template <int num_taps, bool negative_outside_taps = false> void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, void* const dst, const ptrdiff_t dst_stride, const int height, const __m128i* const v_tap) { @@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, // 10 11 20 21 30 31 40 41 srcs[1] = _mm_srli_si128(srcs_0_2, 2); // This uses srcs[0]..srcs[1]. 
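The srli_si128 step quoted above (srcs[1] = _mm_srli_si128(srcs_0_2, 2)) is what lets FilterVertical2xH serve several row windows from one load: with four 2-pixel rows packed into one register, shifting the whole register right by two bytes re-presents the same data starting one row later. A schematic demo, using one 16-bit lane to stand for one packed 2-byte row:

#include <smmintrin.h>
#include <cstdio>

int main() {
  // Lanes model rows r0..r4; shifting by 2 bytes turns (r0 r1 r2 r3 r4)
  // into (r1 r2 r3 r4 0), so srcs[0] and srcs[1] share a single load.
  const __m128i rows =
      _mm_setr_epi16(0x0100, 0x1110, 0x2120, 0x3130, 0x4140, 0, 0, 0);
  const __m128i rows_plus_1 = _mm_srli_si128(rows, 2);
  printf("%04x %04x\n", _mm_extract_epi16(rows, 0),
         _mm_extract_epi16(rows_plus_1, 0));  // prints 0100 1110
  return 0;
}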
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[3] = _mm_srli_si128(srcs_0_4, 6); // This uses srcs[0]..srcs[3]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[5] = _mm_srli_si128(srcs_4_8, 2); // This uses srcs[0]..srcs[5]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); @@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, srcs[7] = _mm_srli_si128(srcs_4_8, 6); // This uses srcs[0]..srcs[7]. - const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap); + const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); const __m128i results_16 = RightShiftWithRounding_S16(sums, kFilterBits - 1); const __m128i results = _mm_packus_epi16(results_16, results_16); diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc index c813df4..8c32117 100644 --- a/src/dsp/x86/distance_weighted_blend_sse4.cc +++ b/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -34,54 +34,50 @@ namespace low_bitdepth { namespace { constexpr int kInterPostRoundBit = 4; +constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1); inline __m128i ComputeWeightedAverage8(const __m128i& pred0, const __m128i& pred1, - const __m128i& weights) { - // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. - const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1); - const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights); - const __m128i result_lo = - RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4); - - const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1); - const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights); - const __m128i result_hi = - RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4); - - return _mm_packs_epi32(result_lo, result_hi); + const __m128i& weight) { + // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0 + // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >> + // 8(=kInterPostRoundBit + 4) + // The formula is manipulated to avoid lengthening to 32 bits. + // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1 + // = (p0 - p1) * w0 + 16 * p1 + // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808. 
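The ComputeWeightedAverage8 derivation above can be sanity-checked in scalar code. The plain shifts below stand in for the mulhi/mulhrs truncation points, so this is a model rather than a bit-exact replay; with these example inputs both forms agree, and in general they differ by at most one rounding step:

#include <cstdio>

int main() {
  const int p0 = 9212, p1 = -5132, w0 = 11, w1 = 16 - w0;
  // Reference form from the comment: (p0 * w0 + p1 * w1 + 128) >> 8.
  const int reference = (p0 * w0 + p1 * w1 + 128) >> 8;
  // Factored form: ((p0 - p1) * w0 >> 4) + p1, then >> 4 with rounding.
  const int factored = (((((p0 - p1) * w0) >> 4) + p1) + 8) >> 4;
  printf("%d %d\n", reference, factored);  // both print 296
  return 0;
}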
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1); + // (((p0 - p1) * (w0 << 12) >> 16) + ((16 * p1) >> 4) + const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight); + // ((p0 - p1) * w0 >> 4) + p1 + const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1); + // (x << 11) >> 15 == x >> 4 + const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust); + // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4 + return _mm_mulhrs_epi16(upscaled_average, right_shift_prep); } template <int height> inline void DistanceWeightedBlend4xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 4) { - // TODO(b/150326556): Use larger loads. - const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 4; - const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights); + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights); const __m128i result_pixels = _mm_packus_epi16(res0, res1); Store4(dst, result_pixels); @@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1( template <int height> inline void DistanceWeightedBlend8xH_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, - const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. 
+ const __m128i weights = _mm_set1_epi16(weight << 11); for (int y = 0; y < height; y += 2) { const __m128i src_00 = LoadAligned16(pred_0); @@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1( inline void DistanceWeightedBlendLarge_SSE4_1( const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, const int height, - void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight, + const int width, const int height, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { auto* dst = static_cast<uint8_t*>(dest); - const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + // Upscale the weight for mulhi. + const __m128i weights = _mm_set1_epi16(weight << 11); int y = height; do { @@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1( void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, - const uint8_t weight_1, const int width, + const uint8_t /*weight_1*/, const int width, const int height, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); + const uint8_t weight = weight_0; if (width == 4) { if (height == 4) { - DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); } else if (height == 8) { - DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); } else { assert(height == 16); - DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); } return; } @@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, if (width == 8) { switch (height) { case 4: - DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest, + dest_stride); return; case 8: - DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest, + dest_stride); return; case 16: - DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest, + dest_stride); return; default: assert(height == 32); - DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, - dest, dest_stride); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest, + dest_stride); return; } } - DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, - height, dest, dest_stride); + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest, + dest_stride); } void Init8bpp() { @@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1( int y = height; do { - const __m128i src_00 = LoadLo8(pred_0); - const __m128i src_10 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - __m128i src_0 = LoadHi8(src_00, pred_0); - __m128i src_1 = LoadHi8(src_10, pred_1); - pred_0 += 4; - pred_1 
+= 4; + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res0 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); - - const __m128i src_01 = LoadLo8(pred_0); - const __m128i src_11 = LoadLo8(pred_1); - pred_0 += 4; - pred_1 += 4; - src_0 = LoadHi8(src_01, pred_0); - src_1 = LoadHi8(src_11, pred_1); - pred_0 += 4; - pred_1 += 4; + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; const __m128i res1 = - ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); StoreLo8(dst, res0); dst += dest_stride; diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc index 9ece947..59d18a6 100644 --- a/src/dsp/x86/film_grain_sse4.cc +++ b/src/dsp/x86/film_grain_sse4.cc @@ -23,14 +23,15 @@ #include <cstdint> #include <cstring> -#include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/dsp/film_grain_common.h" #include "src/dsp/x86/common_sse4.h" +#include "src/utils/array_2d.h" #include "src/utils/common.h" #include "src/utils/compiler_attributes.h" -#include "src/utils/logging.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" namespace libgav1 { namespace dsp { @@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_width; x += 8) { + for (; x + 8 <= safe_width; x += 8) { const __m128i orig = LoadSource(&in_y_row[x]); const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); @@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1( // Prevent arbitrary indices from entering GetScalingFactors. memset(luma_buffer, 0, sizeof(luma_buffer)); const int valid_range = width - x; + assert(valid_range < 8); memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); luma_buffer[valid_range] = in_y_row[width - 1]; const __m128i orig = LoadSource(&in_y_row[x]); @@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); @@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); } - // This section only runs if width % (8 << sub_x) != 0. It should never run - // on 720p and above. if (x < chroma_width) { // Prevent huge indices from entering GetScalingFactors due to // uninitialized values. 
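The film-grain loop bound change above (x < safe_width becoming x + 8 <= safe_width) is a read-overrun guard: when safe_width is not a multiple of 8, the old condition let the last 8-wide load start inside the safe region but finish past it, which is exactly the situation the added assert(valid_range < 8) documents. Illustrative numbers:

#include <cstdio>

int main() {
  const int safe_width = 12;  // example width, not a multiple of 8
  int x = 0;
  // The old guard (x < safe_width) would also run x = 8, and an 8-wide
  // load at x = 8 touches indices 8..15, beyond the safe region.
  for (; x + 8 <= safe_width; x += 8) {
    printf("vector pass covers [%d, %d)\n", x, x + 8);  // [0, 8) only
  }
  printf("padded tail handles [%d, %d)\n", x, safe_width);  // [8, 12)
  return 0;
}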
This is not a problem in 8bpp because the table @@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( int y = 0; do { int x = 0; - for (; x < safe_chroma_width; x += 8) { + for (; x + 8 <= safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc index e642aee..bc61745 100644 --- a/src/dsp/x86/intrapred_directional_sse4.cc +++ b/src/dsp/x86/intrapred_directional_sse4.cc @@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1( } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - template <bool upsampled> inline void DirectionalZone2FromLeftCol_8x8_SSE4_1( uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column, @@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH( } } +template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base, + const __m128i& left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + const __m128i xstep8_vect = _mm_set1_epi16(xstep8); + + // Cover 8x4 case. + const int min_height = (height == 4) ? 4 : 8; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + DirectionalZone1_4xH(dst_x + 4, stride, + top_row + ((x + 4) << upsample_top_shift), + max_top_only_y, -xstep, upsampled_top); + if (max_top_only_y == height) return; + + const __m128i max_shift = _mm_set1_epi8(32); + const __m128i shift_mask = _mm_set1_epi32(0x003F003F); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); + const __m128i sampler_top = + upsampled_top + ? 
_mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) + : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); + // All rows from |min_left_only_y| down for this set of columns, only need + // |left_column| to compute. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + + __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); + __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); + int top_x = -xstep_y; + + const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, + xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), + xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), + top_x -= xstep8) { + // Pick up from the last y-value, using the 10% slower but secure method for + // left prediction. + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y); + } else { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep); + } + + __m128i shifts = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), + shift_mask), + 1); + shifts = _mm_packus_epi16(shifts, shifts); + __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); + shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); + __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); + DirectionalZone1Blend_8xH<upsampled_top, 8>( + dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, + xstep_bounds_off, shifts, dest_index_x, top_x, xstep); + } + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8xH<upsampled_left, 8>( + dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift), + base_left_y, -ystep); + } +} + // 7.11.2.4 (8) 90 < angle > 180 // The strategy for this function is to know how many blocks can be processed // with just pixels from |top_ptr|, then handle mixed blocks, then handle only @@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, const int width, const int height, const int xstep, const int ystep) { auto* dst = static_cast<uint8_t*>(dest); - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - const __m128i max_shift = _mm_set1_epi8(32); - const ptrdiff_t stride8 = stride << 3; - const __m128i dest_index_x = - _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); - const __m128i sampler_top = - upsampled_top - ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100) - : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100); - const __m128i shift_mask = _mm_set1_epi32(0x003F003F); - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute. This assumes minimum |xstep| is 3. + // All columns from |min_top_only_x| to the right will only need |top_row| + // to compute. This assumes minimum |xstep| is 3. const int min_top_only_x = std::min((height * xstep) >> 6, width); - // For steep angles, the source pixels from left_column may not fit in a - // 16-byte load for shuffling. 
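
Context for the deletion above: the shuffle-based left-column path feeds one 8-pixel output row from a single 16-byte load, so it is only exact while the left_column indices touched by those 8 pixels (plus one extra for interpolation) span at most 16 bytes. A rough standalone model of that constraint; the index formula here is a simplification of mine, since the library's real criterion also depends on x and the accumulated subpixel fraction:

#include <cstdio>

int main() {
  // Approximate left_column index consumed by output column i in one row:
  // (i * ystep) >> 6, with ystep in 1/64-pel units. Eight columns must fit
  // in one 16-byte register for the shuffle to be exact.
  for (int ystep = 64; ystep <= 1024; ystep += 192) {
    const int spread = (7 * ystep) >> 6;
    std::printf("ystep=%4d spread=%3d single_load_ok=%d\n", ystep, spread,
                spread <= 15);
  }
  return 0;
}

The replacement heuristic in this revision, shuffle_left_col_x = (ystep < 132) ? 0 : 32, bakes the same idea into a simple threshold instead of the per-ystep lookup table being removed here.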
- // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]); - - const int xstep8 = xstep << 3; - const __m128i xstep8_vect = _mm_set1_epi16(xstep8); // Accumulate xstep across 8 rows. const __m128i xstep_dup = _mm_set1_epi16(-xstep); const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); @@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride, // offset. Following values need the full ystep as a relative offset. const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder); const __m128i ystep_dup = _mm_set1_epi16(-ystep); + const __m128i dest_index_x = + _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000); __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x); left_y = _mm_add_epi16(ystep_init, left_y); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); const __m128i increment_top8 = _mm_set1_epi16(8 << 6); int x = 0; - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. + for (int left_offset = -left_base_increment; x < min_shuffle_x; + x += 8, + xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), + // Watch left_y because it can still get big. + left_y = _mm_add_epi16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_left, upsampled_top>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8), // Watch left_y because it can still get big. left_y = _mm_add_epi16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - DirectionalZone1_4xH(dst_x + 4, stride, - top_row + ((x + 4) << upsample_top_shift), - max_top_only_y, -xstep, upsampled_top); - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y); - // All rows from |min_left_only_y| down for this set of columns, only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. 
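
The comment being removed here describes the guard this revision retires: xstep_bounds lives in 16-bit lanes and gains 8 * xstep per block row, so with a large xstep the running value leaves int16 range after only a few block rows. A small arithmetic sketch of that failure mode (the xstep value is hypothetical):

#include <cstdint>
#include <cstdio>

int main() {
  const int xstep = 3072;  // hypothetical steep-angle step, in 1/64-pel units
  int32_t acc = 0;         // models one 16-bit lane of xstep_bounds
  for (int y = 0; y < 64; y += 8) {
    acc += 8 * xstep;
    std::printf("rows=%2d acc=%6d exceeds_int16=%d\n", y + 8, acc,
                acc > INT16_MAX);
  }
  return 0;
}

The restructured DirectionalZone2_8xH sidesteps this by stopping the blend loop at min_left_only_y, after which only the left column is read.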
- const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect); - __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect); - int top_x = -xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), left_y); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Pick up from the last y-value, using the 10% slower but secure method for - // left prediction. - const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0)); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, - xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect), - xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect), - top_x -= xstep8) { - const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6); - - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - - __m128i shifts = _mm_srli_epi16( - _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift), - shift_mask), - 1); - shifts = _mm_packus_epi16(shifts, shifts); - __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts); - shifts = _mm_unpacklo_epi8(opposite_shifts, shifts); - DirectionalZone1Blend_8xH<upsampled_top, 8>( - dst_x, top_row + (x << upsample_top_shift), stride, sampler_top, - xstep_bounds_off, shifts, dest_index_x, top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8xH<upsampled_left, 8>( - dst_x, stride, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep); - } + DirectionalZone2_8xH<true, upsampled_left, upsampled_top>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_for_shift, xstep_bounds_base, left_y); } for (; x < width; x += 4) { DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift), @@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride, left_offset -= left_base_increment4) { uint8_t* dst_x = dst + x; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4; + // Round down to the nearest multiple of 4. 
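
The hunk just below fixes a genuine masking bug. The old expression used & 0xFFFFFFF4, which clears bit 3 in addition to bits 0 and 1, so it rounds down neither to a multiple of 8 (that would need & ~7) nor to a multiple of 4; & ~3, i.e. & 0xFFFFFFFC, is the correct mask for the latter. A two-line check:

#include <cstdio>

int main() {
  const int v = 12;  // 0b1100
  // Prints "12 4": the old mask also clears bit 3 and can round down by 8.
  std::printf("%d %d\n", v & ~3, static_cast<int>(v & 0xFFFFFFF4));
  return 0;
}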
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3; DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift), max_top_only_y, -xstep, upsampled_top); int y = max_top_only_y; diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc index 3363f0e..b4df072 100644 --- a/src/dsp/x86/loop_restoration_sse4.cc +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter( uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; + ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-unintialized with gcc. s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); sq[0][0] = SquareLo8(s[0][0]); diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc index a18444b..833814c 100644 --- a/src/dsp/x86/mask_blend_sse4.cc +++ b/src/dsp/x86/mask_blend_sse4.cc @@ -30,35 +30,81 @@ namespace libgav1 { namespace dsp { -namespace low_bitdepth { namespace { +template <int subsampling_x, int subsampling_y> +inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_val_0 = LoadUnaligned16(mask); + const __m128i mask_val_1 = LoadUnaligned16(mask + stride); + const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + if (subsampling_x == 1) { + const __m128i row_vals = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); + const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); + __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const __m128i mask_val = LoadLo8(mask); + return _mm_cvtepu8_epi16(mask_val); +} + +// Imitate behavior of ARM vtrn1q_u64. +inline __m128i Transpose1_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + +// Imitate behavior of ARM vtrn2q_u64. +inline __m128i Transpose2_U64(const __m128i a, const __m128i b) { + return _mm_castps_si128( + _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); +} + // Width can only be 4 when it is subsampled from a block of width 8, hence // subsampling_x is always 1 when this function is called. template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_val_01 = LoadUnaligned16(mask); + // Stride is fixed because this is the smallest block size. + const __m128i mask_val_23 = LoadUnaligned16(mask + 16); + // Transpose rows to add row 0 to row 1, and row 2 to row 3. 
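
Transpose1_U64 and Transpose2_U64, introduced earlier in this file's diff, emulate the ARM vtrn1q_u64/vtrn2q_u64 lane swaps using float move intrinsics. A standalone check of the 64-bit lane placement (values arbitrary):

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const __m128i a = _mm_set_epi64x(0x1111111111111111LL, 0x0000000000000000LL);
  const __m128i b = _mm_set_epi64x(0x3333333333333333LL, 0x2222222222222222LL);
  // movelh keeps the low 64 bits of each input: lanes {a.lo, b.lo}.
  const __m128i t1 = _mm_castps_si128(
      _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  // movehl keeps the high 64 bits: lanes {b.hi, a.hi}.
  const __m128i t2 = _mm_castps_si128(
      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
  std::printf("t1 = %016llx|%016llx\n",  // 2222222222222222|0000000000000000
              static_cast<unsigned long long>(_mm_extract_epi64(t1, 1)),
              static_cast<unsigned long long>(_mm_extract_epi64(t1, 0)));
  std::printf("t2 = %016llx|%016llx\n",  // 1111111111111111|3333333333333333
              static_cast<unsigned long long>(_mm_extract_epi64(t2, 1)),
              static_cast<unsigned long long>(_mm_extract_epi64(t2, 0)));
  return 0;
}

This lane order is why GetMask4x2 below calls Transpose2_U64 with its arguments swapped relative to Transpose1_U64: it wants rows {0, 2} paired against rows {1, 3} before the saturating add.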
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23); + const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01); + const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13); + const __m128i one = _mm_set1_epi8(1); + const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); + return RightShiftWithRounding_U16(mask_0, 2); + } + return GetMask8<subsampling_x, 0>(mask, 0); +} + +template <int subsampling_x, int subsampling_y> +inline __m128i GetInterIntraMask4x2(const uint8_t* mask, + ptrdiff_t mask_stride) { if (subsampling_x == 1) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - if (subsampling_y == 1) { - const __m128i next_mask_val_0 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride)); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); + return GetMask4x2<subsampling_x, subsampling_y>(mask); } + // When using intra or difference weighted masks, the function doesn't use + // subsampling, so |mask_stride| may be 4 or 8. + assert(subsampling_y == 0 && subsampling_x == 0); const __m128i mask_val_0 = Load4(mask); const __m128i mask_val_1 = Load4(mask + mask_stride); return _mm_cvtepu8_epi16( _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); } +} // namespace + +namespace low_bitdepth { +namespace { + // This function returns a 16-bit packed mask to fit in _mm_madd_epi16. // 16-bit is also the lowest packing for hadd, but without subsampling there is // an unfortunate conversion required. @@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask, return _mm_cvtepu8_epi16(mask_val); } -// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, -// when is_inter_intra is true, the prediction values are brought to 8-bit -// packing as well. -template <int subsampling_x, int subsampling_y> -inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t stride) { - if (subsampling_x == 1) { - const __m128i row_vals = LoadUnaligned16(mask); - - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - - if (subsampling_y == 1) { - const __m128i next_row_vals = LoadUnaligned16(mask + stride); - const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals); - const __m128i next_mask_val_1 = - _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8)); - subsampled_mask = _mm_add_epi16( - subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1)); - } - const __m128i ret = - RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y); - return _mm_packus_epi16(ret, ret); - } - assert(subsampling_y == 0 && subsampling_x == 0); - // Unfortunately there is no shift operation for 8-bit packing, or else we - // could return everything with 8-bit packing. 
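
For reference, the 4:2:0 paths added in this file (the new GetMask8 earlier in the diff and this GetMask4x2) reduce to the same per-output arithmetic: a 2x2 block of mask values is summed with saturating byte adds (which cannot saturate, since mask values are at most 64) and rounded-shifted by 2. A scalar model, as a sketch rather than library code:

#include <cassert>
#include <cstdint>

// Average of one 2x2 mask block, rounding half up: matches adds_epu8 +
// maddubs(.., 1) + RightShiftWithRounding_U16(.., 2) on each output lane.
static uint16_t Average2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return static_cast<uint16_t>((a + b + c + d + 2) >> 2);
}

int main() {
  assert(Average2x2(64, 64, 64, 64) == 64);  // full-weight block stays 64
  assert(Average2x2(0, 1, 1, 1) == 1);       // 0.75 rounds up to 1
  return 0;
}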
- const __m128i mask_val = LoadLo8(mask); - return mask_val; -} - inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, const int16_t* LIBGAV1_RESTRICT const pred_1, const __m128i pred_mask_0, @@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT mask, - const ptrdiff_t mask_stride, - uint8_t* LIBGAV1_RESTRICT dst, - const ptrdiff_t dst_stride) { +inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; const __m128i mask_inverter = _mm_set1_epi16(64); - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); } template <int subsampling_x, int subsampling_y> -inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, - const int16_t* LIBGAV1_RESTRICT pred_1, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int height, - uint8_t* LIBGAV1_RESTRICT dst, - const ptrdiff_t dst_stride) { +inline void MaskBlending4xH_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height, + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + assert(subsampling_x == 1); const uint8_t* mask = mask_ptr; + constexpr ptrdiff_t mask_stride = 4 << subsampling_x; if (height == 4) { - MaskBlending4x4_SSE4<subsampling_x, subsampling_y>( - pred_0, pred_1, mask, mask_stride, dst, dst_stride); + MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask, + dst, dst_stride); return; } const __m128i mask_inverter = _mm_set1_epi16(64); int y = 0; do { - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, @@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 
+ subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst, dst_stride); @@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0, } template <int subsampling_x, int subsampling_y> -inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - const ptrdiff_t /*prediction_stride_1*/, - const uint8_t* LIBGAV1_RESTRICT const mask_ptr, - const ptrdiff_t mask_stride, const int width, - const int height, void* LIBGAV1_RESTRICT dest, - const ptrdiff_t dst_stride) { +inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t /*prediction_stride_1*/, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dst_stride) { auto* dst = static_cast<uint8_t*>(dest); const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); const ptrdiff_t pred_stride_0 = width; const ptrdiff_t pred_stride_1 = width; if (width == 4) { - MaskBlending4xH_SSE4<subsampling_x, subsampling_y>( - pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride); + MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>( + pred_0, pred_1, mask_ptr, height, dst, dst_stride); return; } const uint8_t* mask = mask_ptr; @@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); const __m128i pred_val_0 = LoadLo8(pred_0); - // TODO(b/150326556): One load. 
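
The TODO removed above refers to the gather just below it: the two prediction rows sit one stride apart, so they cannot come from one contiguous load and are instead merged from two 4-byte loads. A standalone sketch of the idiom; Load4 here is a local stand-in for libgav1's helper of the same name:

#include <smmintrin.h>
#include <cstdint>
#include <cstring>

static __m128i Load4(const void* src) {
  int v;
  std::memcpy(&v, src, sizeof(v));
  return _mm_cvtsi32_si128(v);
}

int main() {
  const uint8_t row0[4] = {1, 2, 3, 4};
  const uint8_t row1[4] = {5, 6, 7, 8};
  // Shift row1 up into bytes 4..7, then OR with row0 in bytes 0..3.
  const __m128i merged =
      _mm_or_si128(_mm_slli_si128(Load4(row1), 4), Load4(row0));
  uint8_t out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), merged);
  return (out[0] == 1 && out[4] == 5) ? 0 : 1;
}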
__m128i pred_val_1 = Load4(pred_1); pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4), pred_val_1); @@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2( } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4x4_SSE4( +inline void InterIntraMaskBlending8bpp4x4_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride) { const __m128i mask_inverter = _mm_set1_epi8(64); const __m128i pred_mask_u16_first = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); const __m128i pred_mask_u16_second = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); mask += mask_stride << (1 + subsampling_y); __m128i pred_mask_1 = _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second); @@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4( } template <int subsampling_x, int subsampling_y> -inline void InterIntraMaskBlending8bpp4xH_SSE4( +inline void InterIntraMaskBlending8bpp4xH_SSE4_1( const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int height) { const uint8_t* mask = mask_ptr; if (height == 4) { - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); return; } int y = 0; do { - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); pred_0 += 4 << 2; pred_1 += pred_stride_1 << 2; mask += mask_stride << (2 + subsampling_y); - InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>( pred_0, pred_1, pred_stride_1, mask, mask_stride); pred_0 += 4 << 2; pred_1 += pred_stride_1 << 2; @@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4( } while (y < height); } +// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because, +// when is_inter_intra is true, the prediction values are brought to 8-bit +// packing as well. +template <int subsampling_x, int subsampling_y> +inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t stride) { + if (subsampling_x == 1) { + const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride); + return _mm_packus_epi16(ret, ret); + } + assert(subsampling_y == 0 && subsampling_x == 0); + // Unfortunately there is no shift operation for 8-bit packing, or else we + // could return everything with 8-bit packing. 
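
The 8-bit packing above exists because _mm_maddubs_epi16 multiplies unsigned bytes from its first operand with signed bytes from its second and adds adjacent pairs, so with predictions interleaved against an interleaved (64 - m, m) weight pair, one instruction produces the whole weighted sum. A scalar model of the resulting blend (a sketch; the rounding constant follows RightShiftWithRounding):

#include <cassert>
#include <cstdint>

static uint8_t Blend(uint8_t p0, uint8_t p1, uint8_t m) {
  // p0 * (64 - m) + p1 * m, rounded and reduced by 6 bits.
  return static_cast<uint8_t>((p0 * (64 - m) + p1 * m + 32) >> 6);
}

int main() {
  assert(Blend(100, 200, 0) == 100);   // mask 0: first prediction only
  assert(Blend(100, 200, 64) == 200);  // mask 64: second prediction only
  assert(Blend(100, 200, 32) == 150);  // midpoint
  return 0;
}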
+ const __m128i mask_val = LoadLo8(mask); + return mask_val; +} + template <int subsampling_x, int subsampling_y> -void InterIntraMaskBlend8bpp_SSE4( +void InterIntraMaskBlend8bpp_SSE4_1( const uint8_t* LIBGAV1_RESTRICT prediction_0, uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, const int width, const int height) { if (width == 4) { - InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>( + InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>( prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, height); return; @@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4( int x = 0; do { const __m128i pred_mask_1 = - GetInterIntraMask8<subsampling_x, subsampling_y>( + GetInterIntraMask8bpp8<subsampling_x, subsampling_y>( mask + (x << subsampling_x), mask_stride); // 64 - mask const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); @@ -411,24 +440,24 @@ void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444) - dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>; + dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422) - dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>; + dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420) - dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>; + dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>; #endif // The is_inter_intra index of mask_blend[][] is replaced by // inter_intra_mask_blend_8bpp[] in 8-bit. #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444) - dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>; + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422) - dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>; + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>; #endif #if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420) - dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>; + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>; #endif } @@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1; constexpr int kMaskInverse = 64; constexpr int kRoundBitsMaskBlend = 4; -inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, - const __m128i zero) { - // Shift out all but the last bit. - const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); - // Avg with zero will shift by 1 and round. 
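
The helper being deleted below encoded a classic trick worth recording: _mm_avg_epu16 computes (a + b + 1) >> 1 without overflow, so averaging (x >> (bits - 1)) with zero yields a rounding right shift by bits. A scalar check over all 16-bit inputs:

#include <cassert>
#include <cstdint>

static uint16_t RoundShiftViaAvg(uint16_t x, int bits) {
  const uint16_t partial = static_cast<uint16_t>(x >> (bits - 1));
  return static_cast<uint16_t>((partial + 0 + 1) >> 1);  // avg with zero
}

int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    for (int bits = 1; bits <= 4; ++bits) {
      const uint16_t expected =
          static_cast<uint16_t>((x + (1u << (bits - 1))) >> bits);
      assert(RoundShiftViaAvg(static_cast<uint16_t>(x), bits) == expected);
    }
  }
  return 0;
}

It is removed here only because the rewritten 10bpp mask helpers now call the generic RightShiftWithRounding_U16 instead.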
- return _mm_avg_epu16(v_tmp_d, zero); -} - inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, const __m128i shift) { const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift); @@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits, } template <int subsampling_x, int subsampling_y> -inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride, - const __m128i zero) { - if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask)); - const __m128i mask_val_1 = - _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y))); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = - LoadHi8(LoadLo8(mask), mask + (mask_stride << 1)); - const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride), - mask + (mask_stride << 1) + mask_stride); - const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i subsampled_mask = _mm_maddubs_epi16(add, one); - return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero); +inline __m128i GetMask4x2(const uint8_t* mask) { + if (subsampling_x == 1 && subsampling_y == 1) { + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_row_23 = LoadUnaligned16(mask + 16); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23); + const __m128i mask_val_3 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8)); + const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2); + const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3); + const __m128i subsampled_mask = + _mm_add_epi16(subsampled_mask_02, subsampled_mask_13); + return RightShiftWithRounding_U16(subsampled_mask, 2); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val_0 = Load4(mask); - const __m128i mask_val_1 = Load4(mask + mask_stride); - return _mm_cvtepu8_epi16( - _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4))); -} - -template <int subsampling_x, int subsampling_y> -inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride, - const __m128i zero) { if (subsampling_x == 1) { - if (subsampling_y == 0) { - const __m128i row_vals = LoadUnaligned16(mask); - const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals); - const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8)); - __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); - return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero); - } - const __m128i one = _mm_set1_epi8(1); - const __m128i mask_val_0 = LoadUnaligned16(mask); - const __m128i mask_val_1 = LoadUnaligned16(mask + stride); - const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1); - const __m128i mask_0 = _mm_maddubs_epi16(add_0, one); - return RightShiftWithRoundingZero_U16(mask_0, 2, zero); + const __m128i mask_row_01 = LoadUnaligned16(mask); + const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01); + const __m128i mask_val_1 = + _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8)); + const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1); + return RightShiftWithRounding_U16(subsampled_mask, 1); } - assert(subsampling_y == 0 && subsampling_x == 0); - const __m128i mask_val = LoadLo8(mask); - return 
_mm_cvtepu8_epi16(mask_val); + return _mm_cvtepu8_epi16(LoadLo8(mask)); } inline void WriteMaskBlendLine10bpp4x2_SSE4_1( @@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, mask += mask_stride << (1 + subsampling_y); dst += dst_stride << 1; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, shift4, dst, @@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); @@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); int y = height; do { - __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, @@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1( mask += mask_stride2; dst += dst_stride2; - pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, offset, max, @@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1( } const uint8_t* mask = mask_ptr; const __m128i mask_inverter = 
_mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; const __m128i offset = _mm_set1_epi32(kCompoundOffset); const __m128i max = _mm_set1_epi16(kMax10bppSample); @@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1( int x = 0; do { const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask @@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1( mask += mask_stride_ss; } while (--y != 0); } - inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( const uint16_t* LIBGAV1_RESTRICT prediction_0, const uint16_t* LIBGAV1_RESTRICT prediction_1, @@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1( dst += dst_stride << 1; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, shift6, @@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( return; } const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); - const __m128i zero = _mm_setzero_si128(); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); const uint8_t pred0_stride2 = 4 << 1; const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; @@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( int y = height; do { __m128i pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1( dst += dst_stride2; pred_mask_0 = - 
GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero); + GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride); pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, pred_mask_1, @@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1( const uint8_t* mask = mask_ptr; const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); - const __m128i zero = _mm_setzero_si128(); const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; int y = height; do { int x = 0; do { const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>( - mask + (x << subsampling_x), mask_stride, zero); + mask + (x << subsampling_x), mask_stride); const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); // 64 - mask diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc index 8ce23b4..f068ff3 100644 --- a/src/dsp/x86/obmc_sse4.cc +++ b/src/dsp/x86/obmc_sse4.cc @@ -39,8 +39,8 @@ namespace { inline void OverlapBlendFromLeft2xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 2; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load2x2(pred, pred + prediction_stride); - const __m128i obmc_pred_val = - Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride); + const __m128i obmc_pred_val = Load4(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = @@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); @@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; // Place the second row of each source in the second four bytes. 
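
In the hunk below, only the obmc side collapses to a single LoadLo8, because this change fixes obmc_prediction_stride at 4 so consecutive obmc rows are contiguous; the destination rows remain strided and keep the alignr idiom. A standalone check of that idiom (Load4 again stands in for the library helper):

#include <smmintrin.h>
#include <cstdint>
#include <cstring>

static __m128i Load4(const void* src) {
  int v;
  std::memcpy(&v, src, sizeof(v));
  return _mm_cvtsi32_si128(v);
}

int main() {
  const uint8_t row0[4] = {10, 11, 12, 13};
  const uint8_t row1[4] = {20, 21, 22, 23};
  // Move row0 to bytes 12..15; alignr by 12 drops it back to bytes 0..3
  // while row1 lands in bytes 4..7.
  const __m128i packed =
      _mm_alignr_epi8(Load4(row1), _mm_slli_si128(Load4(row0), 12), 12);
  uint8_t out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), packed);
  return (out[0] == 10 && out[4] == 20) ? 0 : 1;
}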
const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( const int second_row_result = _mm_extract_epi32(packed_result, 1); memcpy(pred, &second_row_result, sizeof(second_row_result)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y -= 2; } while (y != 0); } @@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( inline void OverlapBlendFromLeft8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi8(64); @@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1( const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); int y = height; do { - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + y -= 2; + } while (y != 0); } void OverlapBlendFromLeft_SSE4_1( @@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - 
const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 4; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1( _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter)); const __m128i pred_val0 = Load4(pred); - const __m128i obmc_pred_val0 = Load4(obmc_pred); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; const __m128i pred_val = _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12); - const __m128i obmc_pred_val = _mm_alignr_epi8( - Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12); const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val); const __m128i result = RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); @@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( Store4(pred - prediction_stride, packed_result); Store4(pred, _mm_srli_si128(packed_result, 4)); pred += prediction_stride; - obmc_pred += obmc_prediction_stride; + obmc_pred += obmc_prediction_stride << 1; y += 2; } while (y < compute_height); } @@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1( inline void OverlapBlendFromTop8xH_SSE4_1( uint8_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, const int height, - const uint8_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_prediction_stride) { + const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_prediction_stride = 8; uint8_t* pred = prediction; const uint8_t* obmc_pred = obmc_prediction; const uint8_t* mask = kObmcMask + height - 2; @@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1( const int compute_height = height - (height >> 2); int y = compute_height; do { - const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]); + const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]); // 64 - mask - const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); - const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); - const __m128i pred_val = LoadLo8(pred); - const __m128i obmc_pred_val = LoadLo8(obmc_pred); - const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val); - const __m128i result = - RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6); + const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0); + const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0); - StoreLo8(pred, _mm_packus_epi16(result, result)); + const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); + + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6); + + --y; + const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]); + // 64 - mask + const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1); + const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1); + + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6); + + const __m128i result = _mm_packus_epi16(result_lo, result_hi); + StoreLo8(pred, result); pred += prediction_stride; - obmc_pred += 
obmc_prediction_stride; - } while (--y != 0); + StoreHi8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride << 1; + } while (--y > 0); } void OverlapBlendFromTop_SSE4_1( @@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } if (width == 8) { - OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, - obmc_prediction_stride); + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred); return; } @@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6; inline void OverlapBlendFromLeft2xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 2; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( int y = height; do { const __m128i pred_val = Load4x2(pred, pred + pred_stride); - const __m128i obmc_pred_val = - Load4x2(obmc_pred, obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadLo8(obmc_pred); const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i result = RightShiftWithRounding_U32( _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend); @@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1( inline void OverlapBlendFromLeft4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const ptrdiff_t pred_stride2 = pred_stride << 1; @@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1( int y = height; do { const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1( assert(height >= 4); if (width == 2) { - OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } if (width == 4) { - OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } const __m128i mask_inverter = _mm_set1_epi8(64); @@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1( inline void OverlapBlendFromTop4xH_SSE4_1( uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, - const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, - const ptrdiff_t obmc_pred_stride) { + const int height, const uint16_t* 
LIBGAV1_RESTRICT const obmc_prediction) { + constexpr int obmc_pred_stride = 4; uint16_t* pred = prediction; const uint16_t* obmc_pred = obmc_prediction; const __m128i mask_inverter = _mm_set1_epi16(64); @@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1( const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8)); const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride); - const __m128i obmc_pred_val = - LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred); const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val); const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val); const __m128i result_lo = RightShiftWithRounding_U32( @@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1( assert(height >= 2); if (width == 4) { - OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred, - obmc_pred_stride); + OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred); return; } diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc index 5830894..5498052 100644 --- a/src/dsp/x86/warp_sse4.cc +++ b/src/dsp/x86/warp_sse4.cc @@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8], } template <bool is_compound, typename DestType> -inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, +inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma, int delta, DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); @@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, } template <bool is_compound, typename DestType> -inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4, - int gamma, int delta, +inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, + int64_t y4, int gamma, int delta, DestType* LIBGAV1_RESTRICT dest_row, ptrdiff_t dest_stride) { int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); @@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, - ptrdiff_t source_stride, int source_width, int y4, + ptrdiff_t source_stride, int source_width, int64_t y4, int ix4, int iy4, int gamma, int delta, int16_t intermediate_result_column[15], DestType* LIBGAV1_RESTRICT dst_row, @@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, ptrdiff_t source_stride, int source_height, int alpha, - int beta, int x4, int ix4, int iy4, + int beta, int64_t x4, int ix4, int iy4, int16_t intermediate_result[15][8]) { // Region 3 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. @@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, template <bool is_compound, typename DestType> inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src, - ptrdiff_t source_stride, int alpha, int beta, int x4, - int ix4, int iy4, int16_t intermediate_result[15][8]) { + ptrdiff_t source_stride, int alpha, int beta, + int64_t x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { // Region 4. // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. 
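
A note on the warp hunks that follow: this revision widens x4/y4 from int to int64_t and funnels their computation through GetWarpFilterParams. The motivation is that dst_x/dst_y are sums of products of pixel positions with warp parameters held at kWarpedModelPrecisionBits (16 in AV1) of fractional precision, which can exceed 32-bit range for large frames. The numbers below are hypothetical but show the scale involved:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t src_x = 40000;         // column in a hypothetical wide frame
  const int64_t warp_param = 1 << 16;  // roughly unit scale at 16-bit precision
  const int64_t dst_x = src_x * warp_param;  // 2621440000: above INT32_MAX
  std::printf("dst_x=%lld fits_int32=%d\n", static_cast<long long>(dst_x),
              dst_x <= INT32_MAX);
  return 0;
}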
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, int16_t intermediate_result_column[15]; }; - const int dst_x = - src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; - const int dst_y = - src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; - const int x4 = dst_x >> subsampling_x; - const int y4 = dst_y >> subsampling_y; - const int ix4 = x4 >> kWarpedModelPrecisionBits; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const WarpFilterParams filter_params = GetWarpFilterParams( + src_x, src_y, subsampling_x, subsampling_y, warp_params); // A prediction block may fall outside the frame's boundaries. If a // prediction block is calculated using only samples outside the frame's // boundary, the filtering can be simplified. We can divide the plane @@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, // border index (source_width - 1 or 0, respectively). Then for each x, // the inner for loop of the horizontal filter is reduced to multiplying // the border pixel by the sum of the filter coefficients. - if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { - if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) { + if ((filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0)) { // Outside the frame in both directions. One repeated value. - WarpRegion1<is_compound, DestType>(src, source_stride, source_width, - source_height, ix4, iy4, dst_row, - dest_stride); + WarpRegion1<is_compound, DestType>( + src, source_stride, source_width, source_height, filter_params.ix4, + filter_params.iy4, dst_row, dest_stride); return; } // Outside the frame horizontally. Rows repeated. WarpRegion2<is_compound, DestType>( - src, source_stride, source_width, y4, ix4, iy4, gamma, delta, - intermediate_result_column, dst_row, dest_stride); + src, source_stride, source_width, filter_params.y4, filter_params.ix4, + filter_params.iy4, gamma, delta, intermediate_result_column, dst_row, + dest_stride); return; } - if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + if ((filter_params.iy4 - 7 >= source_height - 1 || + filter_params.iy4 + 7 <= 0)) { // Outside the frame vertically. - WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha, - beta, x4, ix4, iy4, intermediate_result); + WarpRegion3<is_compound, DestType>( + src, source_stride, source_height, alpha, beta, filter_params.x4, + filter_params.ix4, filter_params.iy4, intermediate_result); } else { // Inside the frame. - WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4, - iy4, intermediate_result); + WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, + filter_params.x4, filter_params.ix4, + filter_params.iy4, intermediate_result); } // Region 3 and 4 vertical filter. 
- VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta, - dst_row, dest_stride); + VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4, + gamma, delta, dst_row, dest_stride); } template <bool is_compound> diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc index 69cb784..53a374d 100644 --- a/src/dsp/x86/weight_mask_sse4.cc +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -37,10 +37,10 @@ namespace { constexpr int kRoundingBits8bpp = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, - const int16_t* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const __m128i pred_00 = LoadAligned16(prediction_0); const __m128i pred_10 = LoadAligned16(prediction_1); const __m128i difference_0 = RightShiftWithRounding_U16( @@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, } #define WEIGHT8_PAIR_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) + WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride) #define WEIGHT8_PAIR_AND_STRIDE \ WEIGHT8_PAIR_WITHOUT_STRIDE; \ @@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { +void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); @@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 3; @@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 5; @@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } #define WEIGHT16_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride) + WeightMask16_SSE4_1<mask_is_inverse, 
true>(pred_0, pred_1, mask, mask_stride) #define WEIGHT16_AND_STRIDE \ WEIGHT16_WITHOUT_STRIDE; \ @@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y = 7; @@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT16_WITHOUT_STRIDE; } -#define WEIGHT32_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride) +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE \ WEIGHT32_WITHOUT_STRIDE; \ @@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x8_SSE4_1(const void* 
LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); WEIGHT32_AND_STRIDE; @@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 5; @@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 6; @@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 21; @@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT32_WITHOUT_STRIDE; } -#define WEIGHT64_WITHOUT_STRIDE \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ - mask + 32, mask_stride); \ - WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ - mask + 48, mask_stride) +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE \ WEIGHT64_WITHOUT_STRIDE; \ @@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const 
void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y5 = 0; @@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const int16_t*>(prediction_0); const auto* pred_1 = static_cast<const int16_t*>(prediction_1); int y3 = 0; @@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, #define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \ dsp->weight_mask[w_index][h_index][0] = \ - 
WeightMask##width##x##height##_SSE4<0>; \ - dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1> + WeightMask##width##x##height##_SSE4_1<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_SSE4_1<1> void Init8bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); assert(dsp != nullptr); @@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6; constexpr int kScaledDiffShift = 4; template <bool mask_is_inverse, bool is_store_16> -inline void WeightMask16_10bpp_SSE4( +inline void WeightMask16_10bpp_SSE4_1( const uint16_t* LIBGAV1_RESTRICT prediction_0, const uint16_t* LIBGAV1_RESTRICT prediction_1, uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { @@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4( } } -#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \ - mask_stride) +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \ + mask_stride) #define WEIGHT8_PAIR_AND_STRIDE_10BPP \ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ @@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4( mask += mask_stride << 1 template <bool mask_is_inverse> -void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); @@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 3; @@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 5; @@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; } -#define WEIGHT16_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride) +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride) #define WEIGHT16_AND_STRIDE_10BPP \ WEIGHT16_WITHOUT_STRIDE_10BPP; \ @@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* 
LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y = 7; @@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 6; @@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT16_WITHOUT_STRIDE_10BPP; } -#define WEIGHT32_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride) +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) #define WEIGHT32_AND_STRIDE_10BPP \ WEIGHT32_WITHOUT_STRIDE_10BPP; \ @@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x8_10bpp_SSE4_1(const void* 
LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); WEIGHT32_AND_STRIDE_10BPP; @@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 6; @@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, WEIGHT32_WITHOUT_STRIDE_10BPP; } -#define WEIGHT64_WITHOUT_STRIDE_10BPP \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \ - mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ - mask + 16, mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ - mask + 32, mask_stride); \ - WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ - mask + 48, mask_stride) +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) #define WEIGHT64_AND_STRIDE_10BPP \ WEIGHT64_WITHOUT_STRIDE_10BPP; \ @@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, mask += mask_stride template <bool mask_is_inverse> -void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* 
LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 5; @@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y5 = 6; @@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 42; @@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 21; @@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, } template <bool mask_is_inverse> -void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, - const void* LIBGAV1_RESTRICT prediction_1, - uint8_t* LIBGAV1_RESTRICT mask, - ptrdiff_t mask_stride) { +void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { const auto* pred_0 = static_cast<const uint16_t*>(prediction_0); const 
auto* pred_1 = static_cast<const uint16_t*>(prediction_1); int y3 = 42; @@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, #define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ dsp->weight_mask[w_index][h_index][0] = \ - WeightMask##width##x##height##_10bpp_SSE4<0>; \ + WeightMask##width##x##height##_10bpp_SSE4_1<0>; \ dsp->weight_mask[w_index][h_index][1] = \ - WeightMask##width##x##height##_10bpp_SSE4<1> + WeightMask##width##x##height##_10bpp_SSE4_1<1> void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); diff --git a/src/film_grain.cc b/src/film_grain.cc index 5c64ff2..44a2543 100644 --- a/src/film_grain.cc +++ b/src/film_grain.cc @@ -824,5 +824,8 @@ template class FilmGrain<kBitdepth8>; #if LIBGAV1_MAX_BITDEPTH >= 10 template class FilmGrain<kBitdepth10>; #endif +#if LIBGAV1_MAX_BITDEPTH == 12 +template class FilmGrain<kBitdepth12>; +#endif } // namespace libgav1 diff --git a/src/film_grain.h b/src/film_grain.h index f2c1e93..bda8458 100644 --- a/src/film_grain.h +++ b/src/film_grain.h @@ -104,7 +104,9 @@ class FilmGrain { using Pixel = typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type; static constexpr int kScalingLutLength = - (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8); + (bitdepth == 10) + ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2 + : kScalingLookupTableSize + kScalingLookupTablePadding; bool Init(); diff --git a/src/film_grain_test.cc b/src/film_grain_test.cc index bf37299..d5854e0 100644 --- a/src/film_grain_test.cc +++ b/src/film_grain_test.cc @@ -435,11 +435,25 @@ const char* GetTestDigestLuma(int bitdepth, int param_index) { "0efbad5f9dc07391ad243232b8df1787", "2bd41882cd82960019aa2b87d5fb1fbc", "1c66629c0c4e7b6f9b0a7a6944fbad50", "2c633a50ead62f8e844a409545f46244", }; + static const char* const kTestDigestsLuma12bpp[10] = { + "1dc9b38a93454a85eb924f25346ae369", "5f9d311ee5384a5a902f8e2d1297319e", + "cf1a35878720564c7a741f91eef66565", "47a0608fe0f6f7ccae42a5ca05783cbf", + "dbc28da0178e3c18a036c3f2203c300f", "04911d2074e3252119ee2d80426b8c01", + "df19ab8103c40b726c842ccf7772208b", "39276967eb16710d98f82068c3eeba41", + "b83100f18abb2062d9c9969f07182b86", "b39a69515491329698cf66f6d4fa371f", + }; - if (bitdepth == 8) { - return kTestDigestsLuma8bpp[param_index]; + switch (bitdepth) { + case 8: + return kTestDigestsLuma8bpp[param_index]; + case 10: + return kTestDigestsLuma10bpp[param_index]; + case 12: + return kTestDigestsLuma12bpp[param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsLuma10bpp[param_index]; } const char* GetTestDigestChromaU(int bitdepth, int param_index) { @@ -457,10 +471,25 @@ const char* GetTestDigestChromaU(int bitdepth, int param_index) { "be306c6a94c55dbd9ef514f0ad4a0011", "904602329b0dec352b3b177b0a2554d2", "58afc9497d968c67fdf2c0cf23b33aa3", "74fee7be6f62724bf901fdd04a733b46", }; - if (bitdepth == 8) { - return kTestDigestsChromaU8bpp[param_index]; + static const char* const kTestDigestsChromaU12bpp[10] = { + "846d608050fe7c19d6cabe2d53cb7821", "2caf4665a26aad50f68497e4b1326417", + "ce40f0f8f8c207c7c985464c812fea33", "820de51d07a21da5c00833bab546f1fa", + "5e7bedd8933cd274af03babb4dbb94dd", "d137cf584eabea86387460a6d3f62bfe", + "f206e0c6ed35b3ab35c6ff37e151e963", "55d87981b7044df225b3b5935185449b", + "6a655c8bf4df6af0e80ae6d004a73a25", "6234ae36076cc77161af6e6e3c04449a", + }; + + switch (bitdepth) { + case 8: 
+ return kTestDigestsChromaU8bpp[param_index]; + case 10: + return kTestDigestsChromaU10bpp[param_index]; + case 12: + return kTestDigestsChromaU12bpp[param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsChromaU10bpp[param_index]; } const char* GetTestDigestChromaV(int bitdepth, int param_index) { @@ -478,95 +507,93 @@ const char* GetTestDigestChromaV(int bitdepth, int param_index) { "7b1624c3543badf5fadaee4d1e602e6b", "3be074e4ca0eec5770748b15661aaadd", "639197401032f272d6c30666a2d08f43", "28075dd34246bf9d5e6197b1944f646a", }; - if (bitdepth == 8) { - return kTestDigestsChromaV8bpp[param_index]; + static const char* const kTestDigestsChromaV12bpp[10] = { + "4957ec919c20707d594fa5c2138c2550", "3f07c65bfb42c81768b1f5ad9611d1ce", + "665d9547171c99faba95ac81a35c9a0c", "1b5d032e0cefdb4041ad51796de8a45e", + "18fa974579a4f1ff8cd7df664fc339d5", "2ffaa4f143495ff73c06a580a97b6321", + "4fd1f562bc47a68dbfaf7c566c7c4da6", "4d37c80c9caf110c1d3d20bd1a1875b3", + "8ea29759640962613166dc5154837d14", "5ca4c10f42d0906c72ebee90fae6ce7d", + }; + + switch (bitdepth) { + case 8: + return kTestDigestsChromaV8bpp[param_index]; + case 10: + return kTestDigestsChromaV10bpp[param_index]; + case 12: + return kTestDigestsChromaV12bpp[param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsChromaV10bpp[param_index]; } const char* GetARTestDigestLuma(int bitdepth, int coeff_lag, int param_index) { static const char* const kTestDigestsLuma8bpp[3][kNumFilmGrainTestParams] = { - { - "a835127918f93478b45f1ba4d20d81bd", - "a835127918f93478b45f1ba4d20d81bd", - "e5db4da626e214bb17bcc7ecffa76303", - "a835127918f93478b45f1ba4d20d81bd", - "a835127918f93478b45f1ba4d20d81bd", - "e5db4da626e214bb17bcc7ecffa76303", - "a835127918f93478b45f1ba4d20d81bd", - "1da62b7233de502123a18546b6c97da2", - "1da62b7233de502123a18546b6c97da2", - "1da62b7233de502123a18546b6c97da2", - }, - { - "11464b880de3ecd6e6189c5c4e7f9b28", - "dfe411762e283b5f49bece02ec200951", - "5c534d92afdf0a5b53dbe4fe7271929c", - "2e1a68a18aca96c31320ba7ceab59be9", - "584c0323e6b276cb9acb1a294d462d58", - "9571eb8f1cbaa96ea3bf64a820a8d9f0", - "305285ff0df87aba3c59e3fc0818697d", - "0066d35c8818cf20230114dcd3765a4d", - "0066d35c8818cf20230114dcd3765a4d", - "16d61b046084ef2636eedc5a737cb6f6", - }, - { - "0c9e2cf1b6c3cad0f7668026e8ea0516", - "7d094855292d0eded9e0d1b5bab1990b", - "fbf28860a5f1285dcc6725a45256a86a", - "dccb906904160ccabbd2c9a7797a4bf9", - "46f645e17f08a3260b1ae70284e5c5b8", - "124fdc90bed11a7320a0cbdee8b94400", - "8d2978651dddeaef6282191fa146f0a0", - "28b4d5aa33f05b3fb7f9323a11936bdc", - "6a8ea684f6736a069e3612d1af6391a8", - "2781ea40a63704dbfeb3a1ac5db6f2fc", - }, + {"a835127918f93478b45f1ba4d20d81bd", "a835127918f93478b45f1ba4d20d81bd", + "e5db4da626e214bb17bcc7ecffa76303", "a835127918f93478b45f1ba4d20d81bd", + "a835127918f93478b45f1ba4d20d81bd", "e5db4da626e214bb17bcc7ecffa76303", + "a835127918f93478b45f1ba4d20d81bd", "1da62b7233de502123a18546b6c97da2", + "1da62b7233de502123a18546b6c97da2", "1da62b7233de502123a18546b6c97da2"}, + {"11464b880de3ecd6e6189c5c4e7f9b28", "dfe411762e283b5f49bece02ec200951", + "5c534d92afdf0a5b53dbe4fe7271929c", "2e1a68a18aca96c31320ba7ceab59be9", + "584c0323e6b276cb9acb1a294d462d58", "9571eb8f1cbaa96ea3bf64a820a8d9f0", + "305285ff0df87aba3c59e3fc0818697d", "0066d35c8818cf20230114dcd3765a4d", + "0066d35c8818cf20230114dcd3765a4d", "16d61b046084ef2636eedc5a737cb6f6"}, + 
{"0c9e2cf1b6c3cad0f7668026e8ea0516", "7d094855292d0eded9e0d1b5bab1990b", + "fbf28860a5f1285dcc6725a45256a86a", "dccb906904160ccabbd2c9a7797a4bf9", + "46f645e17f08a3260b1ae70284e5c5b8", "124fdc90bed11a7320a0cbdee8b94400", + "8d2978651dddeaef6282191fa146f0a0", "28b4d5aa33f05b3fb7f9323a11936bdc", + "6a8ea684f6736a069e3612d1af6391a8", "2781ea40a63704dbfeb3a1ac5db6f2fc"}, }; static const char* const kTestDigestsLuma10bpp[3][kNumFilmGrainTestParams] = { - { - "5e6bc8444ece2d38420f51d82238d812", - "5e6bc8444ece2d38420f51d82238d812", - "2bfaec768794af33d60a9771f971f68d", - "5e6bc8444ece2d38420f51d82238d812", - "5e6bc8444ece2d38420f51d82238d812", - "c880807a368c4e82c23bea6f035ad23f", - "5e6bc8444ece2d38420f51d82238d812", - "c576667da5286183ec3aab9a76f53a2e", - "c576667da5286183ec3aab9a76f53a2e", - "c576667da5286183ec3aab9a76f53a2e", - }, - { - "095c2dd4d4d52aff9696df9bfdb70062", - "983d14afa497060792d472a449a380c7", - "c5fdc0f7c594b2b36132cec6f45a79bd", - "acff232ac5597c1712213150552281d1", - "4dd7341923b1d260092853553b6b6246", - "0ca8afd71a4f564ea1ce69c4af14e9ab", - "9bc7565e5359d09194fcee28e4bf7b94", - "6fea7805458b9d149f238a30e2dc3f13", - "6fea7805458b9d149f238a30e2dc3f13", - "681dff5fc7a7244ba4e4a582ca7ecb14", - }, - { - "cb99352c9c6300e7e825188bb4adaee0", - "7e40674de0209bd72f8e9c6e39ee6f7c", - "3e475572f6b4ecbb2730fd16751ad7ed", - "e6e4c63abc9cb112d9d1f23886cd1415", - "1a1c953b175c105c604902877e2bab18", - "380a53072530223d4ee622e014ee4bdb", - "6137394ea1172fb7ea0cbac237ff1703", - "85ab0c813e46f97cb9f42542f44c01ad", - "68c8ac462f0e28cb35402c538bee32f1", - "0038502ffa4760c8feb6f9abd4de7250", - }, + {"5e6bc8444ece2d38420f51d82238d812", "5e6bc8444ece2d38420f51d82238d812", + "2bfaec768794af33d60a9771f971f68d", "5e6bc8444ece2d38420f51d82238d812", + "5e6bc8444ece2d38420f51d82238d812", "c880807a368c4e82c23bea6f035ad23f", + "5e6bc8444ece2d38420f51d82238d812", "c576667da5286183ec3aab9a76f53a2e", + "c576667da5286183ec3aab9a76f53a2e", "c576667da5286183ec3aab9a76f53a2e"}, + {"095c2dd4d4d52aff9696df9bfdb70062", "983d14afa497060792d472a449a380c7", + "c5fdc0f7c594b2b36132cec6f45a79bd", "acff232ac5597c1712213150552281d1", + "4dd7341923b1d260092853553b6b6246", "0ca8afd71a4f564ea1ce69c4af14e9ab", + "9bc7565e5359d09194fcee28e4bf7b94", "6fea7805458b9d149f238a30e2dc3f13", + "6fea7805458b9d149f238a30e2dc3f13", "681dff5fc7a7244ba4e4a582ca7ecb14"}, + {"cb99352c9c6300e7e825188bb4adaee0", "7e40674de0209bd72f8e9c6e39ee6f7c", + "3e475572f6b4ecbb2730fd16751ad7ed", "e6e4c63abc9cb112d9d1f23886cd1415", + "1a1c953b175c105c604902877e2bab18", "380a53072530223d4ee622e014ee4bdb", + "6137394ea1172fb7ea0cbac237ff1703", "85ab0c813e46f97cb9f42542f44c01ad", + "68c8ac462f0e28cb35402c538bee32f1", "0038502ffa4760c8feb6f9abd4de7250"}, }; - if (bitdepth == 8) { - return kTestDigestsLuma8bpp[coeff_lag - 1][param_index]; + static const char* const kTestDigestsLuma12bpp[3][kNumFilmGrainTestParams] = { + {"d618bbb0e337969c91b1805f39561520", "d618bbb0e337969c91b1805f39561520", + "678f6e911591daf9eca4e305dabdb2b3", "d618bbb0e337969c91b1805f39561520", + "d618bbb0e337969c91b1805f39561520", "3b26f49612fd587c7360790d40adb5de", + "d618bbb0e337969c91b1805f39561520", "33f77d3ff50cfc64c6bc9a896b567377", + "33f77d3ff50cfc64c6bc9a896b567377", "33f77d3ff50cfc64c6bc9a896b567377"}, + {"362fd67050fb7abaf57c43a92d993423", "e014ae0eb9e697281015c38905cc46ef", + "82b867e57151dc08afba31eccf5ccf69", "a94ba736cdce7bfa0b550285f59e47a9", + "3f1b0b7dd3b10e322254d35e4e185b7c", "7929708e5f017d58c53513cb79b35fda", + "6d26d31a091cbe642a7070933bd7de5a", 
"dc29ac40a994c0a760bfbad0bfc15b3a", + "dc29ac40a994c0a760bfbad0bfc15b3a", "399b919db5190a5311ce8d166580827b"}, + {"6116d1f569f5b568eca4dc1fbf255086", "7e9cf31ea74e8ea99ffd12094ce6cd05", + "bb982c4c39e82a333d744defd16f4388", "7c6e584b082dc6b97ed0d967def3993f", + "fb234695353058f03c8e128f2f8de130", "9218c6ca67bf6a9237f98aa1ce7acdfd", + "d1fb834bbb388ed066c5cbc1c79b5bdf", "d6f630daedc08216fcea12012e7408b5", + "dd7fe49299e6f113a98debc7411c8db8", "8b89e45a5101a28c24209ae119eafeb8"}, + }; + + switch (bitdepth) { + case 8: + return kTestDigestsLuma8bpp[coeff_lag - 1][param_index]; + case 10: + return kTestDigestsLuma10bpp[coeff_lag - 1][param_index]; + case 12: + return kTestDigestsLuma12bpp[coeff_lag - 1][param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsLuma10bpp[coeff_lag - 1][param_index]; } const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag, @@ -589,12 +616,28 @@ const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag, "e2688d7286cd43fe0a3ea734d2ad0f77", "853193c4981bd882912171061327bdf2", }; + static const char* const kTestDigestsChromaU12bpp[12] = { + "04c23b01d01c0e3f3247f3741581b383", "9f8ea1d66e44f6fe93d765ce56b2b0f3", + "5dda44b128d6c244963f1e8e17cc1d22", "9dd0a79dd2f772310a95762d445bface", + "0dbd40d930e4873d72ea72b9e3d62440", "d7d83c207c6b435a164206d5f457931f", + "e8d04f6e63ed63838adff965275a1ff1", "fc09a903e941fcff8bad67a84f705775", + "9cd706606a2aa40d0957547756f7abd9", "258b37e7b8f48db77dac7ea24073fe69", + "80149b8bb05308da09c1383d8b79d3da", "e993f3bffae53204a1942feb1af42074", + }; + assert(!(subsampling_x == 0 && subsampling_y == 1)); const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigestsChromaU8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigestsChromaU8bpp[base_index]; + case 10: + return kTestDigestsChromaU10bpp[base_index]; + case 12: + return kTestDigestsChromaU12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsChromaU10bpp[base_index]; } const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag, @@ -617,12 +660,28 @@ const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag, "d3d0912e3fdb956fef416a010bd7b4c2", "a2fca8abd9fd38d2eef3c4495d9eff78", }; + static const char* const kTestDigestsChromaV12bpp[12] = { + "0d1890335f4464167de22353678ca9c6", "9e6830aba73139407196f1c811f910bc", + "6018f2fb76bd648bef0262471cfeba5c", "78e1ae1b790d709cdb8997621cf0fde3", + "5b44ae281d7f9db2f17aa3c24b4741dd", "f931d16991669cb16721de87da9b8067", + "5580f2aed349d9cabdafb9fc25a57b1c", "86918cd78bf95e6d4405dd050f5890b8", + "13c8b314eeebe35fa60b703d94e1b2c1", "13c6fb75cab3f42e0d4ca31e4d068b0e", + "bb9ca0bd6f8cd67e44c8ac2803abf5a5", "0da4ea711ffe557bb66577392b6f148b", + }; + assert(!(subsampling_x == 0 && subsampling_y == 1)); const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigestsChromaV8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigestsChromaV8bpp[base_index]; + case 10: + return kTestDigestsChromaV10bpp[base_index]; + case 12: + return kTestDigestsChromaV12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsChromaV10bpp[base_index]; } const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) { @@ -642,10 +701,25 @@ const char* GetGrainGenerationTestDigestLuma(int 
bitdepth, int param_index) { "85a122e32648fde84b883a1f98947c60", "dee656e3791138285bc5b71e3491a177", }; - if (bitdepth == 8) { - return kTestDigestsLuma8bpp[param_index]; + static const char* const kTestDigestsLuma12bpp[kNumFilmGrainTestParams] = { + "ae359794b5340d073d597117046886ac", "4d4ad3908b4fb0f248a0086537dd6b1e", + "672a97e15180cbeeaf76d763992c9f23", "739124d10d16e00a158e833ea92107bc", + "4c38c738ff7ffc50adaa4474584d3aae", "ca05ba7e51000a7d10e5cbb2101bbd86", + "e207022b916bf03a76ac8742af29853d", "7454bf1859149237ff74f1161156c857", + "10fc2a16e663bbc305255b0883cfcd45", "4228abff6899bb33839b579288ab29fe", + }; + + switch (bitdepth) { + case 8: + return kTestDigestsLuma8bpp[param_index]; + case 10: + return kTestDigestsLuma10bpp[param_index]; + case 12: + return kTestDigestsLuma12bpp[param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigestsLuma10bpp[param_index]; } const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag, @@ -663,11 +737,24 @@ const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag, "125bf18b7787e8f0792ea12f9210de0d", "21cf98cbce17eca77dc150cc9be0e0a0", }; + static const char* const kTestDigests12bpp[6] = { + "57f8e17078b6e8935252e918a2562636", "556a7b294a99bf1163b7166b4f68357e", + "249bee5572cd7d1cc07182c97adc4ba7", "9bf43ae1998c2a5b2e5f4d8236b58747", + "477c08fa26499936e5bb03bde097633e", "fe64b7166ff87ea0711ae4f519cadd59", + }; + const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigests8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigests8bpp[base_index]; + case 10: + return kTestDigests10bpp[base_index]; + case 12: + return kTestDigests12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigests10bpp[base_index]; } const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag, @@ -684,11 +771,24 @@ const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag, "656a9ef056b04565bec9ca7e0873c408", "a70fff81ab28d02d99dd4f142699ba39", }; + static const char* const kTestDigests12bpp[6] = { + "146f7ceadaf77e7a3c41e191a58c1d3c", "de18526db39630936733e687cdca189e", + "165c96ff63bf3136505ab1d239f7ceae", "a102636662547f84e5f6fb6c3e4ef959", + "4cb073fcc783c158a95c0b1ce0d27e9f", "3a734c71d4325a7da53e2a6e00f81647", + }; + const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigests8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigests8bpp[base_index]; + case 10: + return kTestDigests10bpp[base_index]; + case 12: + return kTestDigests12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigests10bpp[base_index]; } const char* GetScalingInitTestDigest(int param_index, int bitdepth) { @@ -708,23 +808,36 @@ const char* GetScalingInitTestDigest(int param_index, int bitdepth) { "11b3e256c74cee2b5679f7457793869a", "89fab5c1db09e242d0494d1c696a774a", }; - if (bitdepth == 8) { - return kTestDigests8bpp[param_index]; + static const char* const kTestDigests12bpp[kNumFilmGrainTestParams] = { + "1554df49a863a851d146213e09d311a4", "84808c3ed3b5495a62c9d2dd9a08cb26", + "bb31f083a3bd9ef26587478b8752f280", "34fdfe61d6871e4882e38062a0725c5c", + "bb31f083a3bd9ef26587478b8752f280", "e7b8c3e4508ceabe89b78f10a9e160b8", + "e7b8c3e4508ceabe89b78f10a9e160b8", "a0ccc9e3d0f0c9d1f08f1249264d92f5", 
+ "7992a96883c8a9a35d6ca8961bc4515b", "de906ce2c0fceed6f168215447b21b16", + }; + + switch (bitdepth) { + case 8: + return kTestDigests8bpp[param_index]; + case 10: + return kTestDigests10bpp[param_index]; + case 12: + return kTestDigests12bpp[param_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - assert(bitdepth == 10); - return kTestDigests10bpp[param_index]; } const char* GetBlendLumaTestDigest(int bitdepth) { - static const char* const kTestDigest8bpp = "de35b16c702690b1d311cdd0973835d7"; - - static const char* const kTestDigest10bpp = - "60e9f24dcaaa0207a8db5ab5f3c66608"; + static const char* const kTestDigests[] = { + "de35b16c702690b1d311cdd0973835d7", + "60e9f24dcaaa0207a8db5ab5f3c66608", + "8e7d44b620bb7768459074be6bfbca7b", + }; - if (bitdepth == 8) { - return kTestDigest8bpp; - } - return kTestDigest10bpp; + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return kTestDigests[(bitdepth - 8) / 2]; } const char* GetBlendChromaUTestDigest(int bitdepth, @@ -742,12 +855,25 @@ const char* GetBlendChromaUTestDigest(int bitdepth, "9b7958a2278a16bce2b7bc31fdd811f5", "c5c3c8cccf6a2b4e40b4a412a5bf4f08", }; + static const char* const kTestDigests12bpp[6] = { + "8fad0cc641da35e0d2d8f178c7ce8394", "793eb9d2e6b4ea2e3bb08e7068236155", + "9156bd85ab9493d8867a174f920bb1e6", "6834319b4c88e3e0c96b6f8d7efd08dd", + "c40e492790d3803a734efbc6feca46e2", "d884c3b1e2c21d98844ca7639e0599a5", + }; + const int base_index = 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigests8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigests8bpp[base_index]; + case 10: + return kTestDigests10bpp[base_index]; + case 12: + return kTestDigests12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigests10bpp[base_index]; } const char* GetBlendChromaVTestDigest(int bitdepth, @@ -765,12 +891,25 @@ const char* GetBlendChromaVTestDigest(int bitdepth, "ed4382caa936acf1158ff8049d18ffac", "942bdd1344c9182dd7572099fb9372db", }; + static const char* const kTestDigests12bpp[6] = { + "70704a1e171a3a70d40b7d0037a75fbc", "62549e2afbf36a1ed405a6574d39c542", + "e93889927ab77c6e0767ff071d980c02", "a0c1f6ed78874137710fee7418d80959", + "f6283e36a25cb867e30bdf0bfdb2124b", "741c2d48898835b9d9e3bd0b6ac6269a", + }; + const int base_index = 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y; - if (bitdepth == 8) { - return kTestDigests8bpp[base_index]; + switch (bitdepth) { + case 8: + return kTestDigests8bpp[base_index]; + case 10: + return kTestDigests10bpp[base_index]; + case 12: + return kTestDigests12bpp[base_index]; + default: + assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12); + return nullptr; } - return kTestDigests10bpp[base_index]; } // GetFilmGrainRandomNumber() is only invoked with |bits| equal to 11 or 8. 
Test @@ -844,6 +983,7 @@ template <int bitdepth> class AutoRegressionTestLuma : public testing::TestWithParam<std::tuple<int, int>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -982,6 +1122,28 @@ TEST_P(AutoRegressionTestLuma10bpp, DISABLED_Speed) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using AutoRegressionTestLuma12bpp = AutoRegressionTestLuma<12>; + +TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLuma) { + TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()), + 1, /*saturate=*/false, + /*compare=*/false); +} + +TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLumaSaturated) { + TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()), + 1, /*saturate=*/true, + /*compare=*/true); +} + +TEST_P(AutoRegressionTestLuma12bpp, DISABLED_Speed) { + TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()), + 1e5, + /*saturate=*/false, /*compare=*/false); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P( C, AutoRegressionTestLuma8bpp, testing::Combine(testing::Range(1, 4) /* coeff_lag */, @@ -1006,6 +1168,13 @@ INSTANTIATE_TEST_SUITE_P( #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P( + C, AutoRegressionTestLuma12bpp, + testing::Combine(testing::Range(1, 4) /* coeff_lag */, + testing::Range(0, 10) /* param_index */)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct AutoRegressionChromaTestParam { explicit AutoRegressionChromaTestParam(const std::tuple<int, int>& in) : coeff_lag(std::get<0>(in)) { @@ -1033,6 +1202,7 @@ template <int bitdepth> class AutoRegressionTestChroma : public testing::TestWithParam<std::tuple<int, int>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -1228,9 +1398,37 @@ TEST_P(AutoRegressionTestChroma10bpp, DISABLED_Speed) { 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)), /*saturate=*/false, /*compare=*/false); } - #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using AutoRegressionTestChroma12bpp = AutoRegressionTestChroma<12>; + +TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChroma) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x, + test_param.subsampling_y, 1, + /*saturate=*/false, + /*compare=*/false); +} + +TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChromaSaturated) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x, + test_param.subsampling_y, 1, /*saturate=*/true, + /*compare=*/true); +} + +TEST_P(AutoRegressionTestChroma12bpp, DISABLED_Speed) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma( + test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y, + // Subsampling cuts each dimension of the chroma blocks in half, so run + // twice as many times to compensate. 
+ 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)), + /*saturate=*/false, /*compare=*/false); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma8bpp, testing::Combine(testing::Range(0, 4) /* coeff_lag */, testing::Range(0, @@ -1243,6 +1441,13 @@ INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma10bpp, 3) /* subsampling */)); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma12bpp, + testing::Combine(testing::Range(0, 4) /* coeff_lag */, + testing::Range(0, + 3) /* subsampling */)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + #if LIBGAV1_ENABLE_NEON INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma8bpp, testing::Combine(testing::Range(0, 4) /* coeff_lag */, @@ -1260,6 +1465,7 @@ INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma10bpp, template <int bitdepth> class GrainGenerationTest : public testing::TestWithParam<int> { protected: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -1313,6 +1519,18 @@ TEST_P(GrainGenerationTest10bpp, DISABLED_LumaSpeed) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using GrainGenerationTest12bpp = GrainGenerationTest<12>; + +TEST_P(GrainGenerationTest12bpp, GenerateGrainLuma) { + TestGenerateGrainLuma(GetParam(), 1); +} + +TEST_P(GrainGenerationTest12bpp, DISABLED_LumaSpeed) { + TestGenerateGrainLuma(GetParam(), 1e5); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp, testing::Range(0, 10) /* param_index */); @@ -1320,6 +1538,10 @@ INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp, INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest10bpp, testing::Range(0, 10) /* param_index */); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest12bpp, + testing::Range(0, 10) /* param_index */); +#endif // LIBGAV1_MAX_BITDEPTH == 12 // This param type is used for both ConstructStripesTest and // ConstructImageTest. 
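Every digest lookup in this test file flattens its (flag, subsampling_x, subsampling_y) triple into one index with the base_index = 3 * flag + subsampling_x + subsampling_y pattern seen above, and the tables rely on AV1 never pairing subsampling_x == 0 with subsampling_y == 1, so the three legal subsampling pairs occupy offsets 0 through 2. A worked sketch of that mapping (the DigestIndex name is illustrative, not from the file):

  #include <cassert>

  int DigestIndex(int flag, int subsampling_x, int subsampling_y) {
    // Matches the assert guarding the lookups above: (0, 1) never occurs.
    assert(!(subsampling_x == 0 && subsampling_y == 1));
    return 3 * flag + subsampling_x + subsampling_y;
  }

  int main() {
    assert(DigestIndex(0, 0, 0) == 0);  // 4:4:4, flag off
    assert(DigestIndex(0, 1, 0) == 1);  // 4:2:2, flag off
    assert(DigestIndex(0, 1, 1) == 2);  // 4:2:0, flag off
    assert(DigestIndex(1, 1, 1) == 5);  // 4:2:0, flag on: last of 6 entries
    return 0;
  }

With coeff_lag in [0, 3] in place of a binary flag, the same formula spans the 12-entry AR chroma tables (3 * 3 + 1 + 1 == 11).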
@@ -1350,6 +1572,7 @@ template <int bitdepth> class ConstructStripesTest : public testing::TestWithParam<std::tuple<int, int>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -1523,6 +1746,30 @@ TEST_P(ConstructStripesTest10bpp, DISABLED_Speed) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using ConstructStripesTest12bpp = ConstructStripesTest<12>; + +TEST_P(ConstructStripesTest12bpp, RandomValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/false, /*compare=*/false); +} +TEST_P(ConstructStripesTest12bpp, SaturatedValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/true, /*compare=*/true); +} + +TEST_P(ConstructStripesTest12bpp, DISABLED_Speed) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/500, + /*saturate=*/false, /*compare=*/false); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest8bpp, testing::Combine(testing::Range(0, 2), testing::Range(0, 3))); @@ -1533,9 +1780,16 @@ INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest10bpp, testing::Range(0, 3))); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest12bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + template <int bitdepth> class ConstructImageTest : public testing::TestWithParam<std::tuple<int, int>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -1732,6 +1986,31 @@ TEST_P(ConstructImageTest10bpp, DISABLED_Speed) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using ConstructImageTest12bpp = ConstructImageTest<12>; + +TEST_P(ConstructImageTest12bpp, RandomValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/false, /*compare=*/false); +} + +TEST_P(ConstructImageTest12bpp, SaturatedValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/true, /*compare=*/true); +} + +TEST_P(ConstructImageTest12bpp, DISABLED_Speed) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/500, + /*saturate=*/false, /*compare=*/false); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest8bpp, testing::Combine(testing::Range(0, 2), testing::Range(0, 3))); @@ -1748,9 +2027,16 @@ INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest10bpp, testing::Range(0, 3))); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest12bpp, + testing::Combine(testing::Range(0, 2), + 
testing::Range(0, 3))); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + template <int bitdepth> class ScalingLookupTableTest : public testing::TestWithParam<int> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); ScalingLookupTableTest() { test_utils::ResetDspTable(bitdepth); FilmGrainInit_C(); @@ -1840,6 +2126,18 @@ TEST_P(ScalingLookupTableTest10bpp, DISABLED_Speed) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using ScalingLookupTableTest12bpp = ScalingLookupTableTest<12>; + +TEST_P(ScalingLookupTableTest12bpp, ZeroPoints) { ZeroPoints(); } + +TEST_P(ScalingLookupTableTest12bpp, Correctness) { TestSpeed(/*num_runs=*/1); } + +TEST_P(ScalingLookupTableTest12bpp, DISABLED_Speed) { + TestSpeed(/*num_runs=*/1e5); +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest8bpp, testing::Range(0, kNumFilmGrainTestParams)); @@ -1858,6 +2156,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest12bpp, + testing::Range(0, kNumFilmGrainTestParams)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + struct BlendNoiseTestParam { explicit BlendNoiseTestParam(const std::tuple<int, int>& in) : chroma_scaling_from_luma(std::get<0>(in)) { @@ -1884,6 +2187,7 @@ struct BlendNoiseTestParam { template <int bitdepth, typename Pixel> class BlendNoiseTest : public testing::TestWithParam<std::tuple<int, int>> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); using GrainType = typename std::conditional<bitdepth == 8, int8_t, int16_t>::type; @@ -2213,9 +2517,22 @@ INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest10bpp, #endif #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using BlendNoiseTest12bpp = BlendNoiseTest<12, uint16_t>; + +TEST_P(BlendNoiseTest12bpp, MatchesOriginalOutput) { TestSpeed(1); } + +TEST_P(BlendNoiseTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); } + +INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest12bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + template <int bitdepth, typename Pixel> class FilmGrainSpeedTest : public testing::TestWithParam<int> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); FilmGrainSpeedTest() { test_utils::ResetDspTable(bitdepth); FilmGrainInit_C(); @@ -2354,6 +2671,16 @@ INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest10bpp, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using FilmGrainSpeedTest12bpp = FilmGrainSpeedTest<12, uint16_t>; + +TEST_P(FilmGrainSpeedTest12bpp, MatchesOriginalOutput) { TestSpeed(1); } + +TEST_P(FilmGrainSpeedTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); } + +INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest12bpp, testing::Values(0, 3, 8)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace } // namespace film_grain } // namespace dsp diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h index 880c320..0a5586e 100644 --- a/src/gav1/decoder_buffer.h +++ b/src/gav1/decoder_buffer.h @@ -115,6 +115,27 @@ typedef enum Libgav1ColorRange { kLibgav1ColorRangeFull // YUV/RGB [0..255] } Libgav1ColorRange; +typedef struct Libgav1ObuMetadataHdrCll { // NOLINT + uint16_t max_cll; // Maximum content light level. + uint16_t max_fall; // Maximum frame-average light level. 
+} Libgav1ObuMetadataHdrCll; + +typedef struct Libgav1ObuMetadataHdrMdcv { // NOLINT + uint16_t primary_chromaticity_x[3]; + uint16_t primary_chromaticity_y[3]; + uint16_t white_point_chromaticity_x; + uint16_t white_point_chromaticity_y; + uint32_t luminance_max; + uint32_t luminance_min; +} Libgav1ObuMetadataHdrMdcv; + +typedef struct Libgav1ObuMetadataItutT35 { // NOLINT + uint8_t country_code; + uint8_t country_code_extension_byte; // Valid if country_code is 0xFF. + uint8_t* payload_bytes; + int payload_size; +} Libgav1ObuMetadataItutT35; + typedef struct Libgav1DecoderBuffer { #if defined(__cplusplus) LIBGAV1_PUBLIC int NumPlanes() const { @@ -146,6 +167,18 @@ typedef struct Libgav1DecoderBuffer { // Temporal id of this frame. int temporal_id; + Libgav1ObuMetadataHdrCll hdr_cll; + int has_hdr_cll; // 1 if the values in hdr_cll are valid for this frame. 0 + // otherwise. + + Libgav1ObuMetadataHdrMdcv hdr_mdcv; + int has_hdr_mdcv; // 1 if the values in hdr_mdcv are valid for this frame. 0 + // otherwise. + + Libgav1ObuMetadataItutT35 itut_t35; + int has_itut_t35; // 1 if the values in itut_t35 are valid for this frame. 0 + // otherwise. + // The |user_private_data| argument passed to Decoder::EnqueueFrame(). int64_t user_private_data; // The |private_data| field of FrameBuffer. Set by the get frame buffer @@ -264,6 +297,10 @@ using ColorRange = Libgav1ColorRange; constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio; constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull; +using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll; +using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv; +using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35; + using DecoderBuffer = Libgav1DecoderBuffer; } // namespace libgav1 diff --git a/src/gav1/version.h b/src/gav1/version.h index 9bdc630..b386acc 100644 --- a/src/gav1/version.h +++ b/src/gav1/version.h @@ -23,7 +23,7 @@ // (https://semver.org). 
#define LIBGAV1_MAJOR_VERSION 0 -#define LIBGAV1_MINOR_VERSION 17 +#define LIBGAV1_MINOR_VERSION 18 #define LIBGAV1_PATCH_VERSION 0 #define LIBGAV1_VERSION \ diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake index b97d09d..1314d0b 100644 --- a/src/libgav1_decoder.cmake +++ b/src/libgav1_decoder.cmake @@ -107,7 +107,7 @@ macro(libgav1_add_decoder_targets) list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) endif() - if(NOT ANDROID) + if(use_absl_threading) list(APPEND libgav1_absl_deps absl::base absl::synchronization) endif() diff --git a/src/obu_parser.cc b/src/obu_parser.cc index 445450b..9e9166a 100644 --- a/src/obu_parser.cc +++ b/src/obu_parser.cc @@ -1767,11 +1767,7 @@ bool ObuParser::ParseFrameParameters() { int64_t scratch; if (sequence_header_.reduced_still_picture_header) { frame_header_.show_frame = true; - current_frame_ = buffer_pool_->GetFreeBuffer(); - if (current_frame_ == nullptr) { - LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); - return false; - } + if (!EnsureCurrentFrameIsNotNull()) return false; } else { OBU_READ_BIT_OR_FAIL; frame_header_.show_existing_frame = scratch != 0; @@ -1840,11 +1836,7 @@ bool ObuParser::ParseFrameParameters() { } return true; } - current_frame_ = buffer_pool_->GetFreeBuffer(); - if (current_frame_ == nullptr) { - LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); - return false; - } + if (!EnsureCurrentFrameIsNotNull()) return false; OBU_READ_LITERAL_OR_FAIL(2); frame_header_.frame_type = static_cast<FrameType>(scratch); current_frame_->set_frame_type(frame_header_.frame_type); @@ -2395,50 +2387,58 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) { size -= metadata_type_size; int64_t scratch; switch (metadata_type) { - case kMetadataTypeHdrContentLightLevel: + case kMetadataTypeHdrContentLightLevel: { + ObuMetadataHdrCll hdr_cll; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.max_cll = scratch; + hdr_cll.max_cll = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.max_fall = scratch; + hdr_cll.max_fall = scratch; + if (!EnsureCurrentFrameIsNotNull()) return false; + current_frame_->set_hdr_cll(hdr_cll); break; - case kMetadataTypeHdrMasteringDisplayColorVolume: + } + case kMetadataTypeHdrMasteringDisplayColorVolume: { + ObuMetadataHdrMdcv hdr_mdcv; for (int i = 0; i < 3; ++i) { OBU_READ_LITERAL_OR_FAIL(16); - metadata_.primary_chromaticity_x[i] = scratch; + hdr_mdcv.primary_chromaticity_x[i] = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.primary_chromaticity_y[i] = scratch; + hdr_mdcv.primary_chromaticity_y[i] = scratch; } OBU_READ_LITERAL_OR_FAIL(16); - metadata_.white_point_chromaticity_x = scratch; + hdr_mdcv.white_point_chromaticity_x = scratch; OBU_READ_LITERAL_OR_FAIL(16); - metadata_.white_point_chromaticity_y = scratch; + hdr_mdcv.white_point_chromaticity_y = scratch; OBU_READ_LITERAL_OR_FAIL(32); - metadata_.luminance_max = static_cast<uint32_t>(scratch); + hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch); OBU_READ_LITERAL_OR_FAIL(32); - metadata_.luminance_min = static_cast<uint32_t>(scratch); + hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch); + if (!EnsureCurrentFrameIsNotNull()) return false; + current_frame_->set_hdr_mdcv(hdr_mdcv); break; + } case kMetadataTypeScalability: if (!ParseMetadataScalability()) return false; break; case kMetadataTypeItutT35: { + ObuMetadataItutT35 itut_t35; OBU_READ_LITERAL_OR_FAIL(8); - metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch); + itut_t35.country_code = 
static_cast<uint8_t>(scratch); ++data; --size; - if (metadata_.itu_t_t35_country_code == 0xFF) { + if (itut_t35.country_code == 0xFF) { OBU_READ_LITERAL_OR_FAIL(8); - metadata_.itu_t_t35_country_code_extension_byte = - static_cast<uint8_t>(scratch); + itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch); ++data; --size; } - // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says: - // itu_t_t35_payload_bytes shall be bytes containing data registered as + // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says: + // itut_t35.payload_bytes shall be bytes containing data registered as // specified in Recommendation ITU-T T.35. - // Therefore itu_t_t35_payload_bytes is byte aligned and the first - // trailing byte should be 0x80. Since the exact syntax of - // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the - // end of itu_t_t35_payload_bytes by searching for the trailing bit. + // Therefore itut_t35.payload_bytes is byte aligned and the first trailing + // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes + // is not defined in the AV1 spec, identify the end of + // itut_t35.payload_bytes by searching for the trailing bit. const int i = GetLastNonzeroByteIndex(data, size); if (i < 0) { LIBGAV1_DLOG(ERROR, "Trailing bit is missing."); @@ -2447,20 +2447,15 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) { if (data[i] != 0x80) { LIBGAV1_DLOG( ERROR, - "itu_t_t35_payload_bytes is not byte aligned. The last nonzero " - "byte of the payload data is 0x%x, should be 0x80.", + "itut_t35.payload_bytes is not byte aligned. The last nonzero byte " + "of the payload data is 0x%x, should be 0x80.", data[i]); return false; } - if (i != 0) { - // data[0]..data[i - 1] are itu_t_t35_payload_bytes. - metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]); - if (metadata_.itu_t_t35_payload_bytes == nullptr) { - LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed."); - return false; - } - memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i); - metadata_.itu_t_t35_payload_size = i; + itut_t35.payload_size = i; + if (!EnsureCurrentFrameIsNotNull() || + !current_frame_->set_itut_t35(itut_t35, data)) { + return false; } // Skip all bits before the trailing bit. bit_reader_->SkipBytes(i); @@ -2637,6 +2632,16 @@ bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) { return bit_reader_ != nullptr; } +bool ObuParser::EnsureCurrentFrameIsNotNull() { + if (current_frame_ != nullptr) return true; + current_frame_ = buffer_pool_->GetFreeBuffer(); + if (current_frame_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); + return false; + } + return true; +} + bool ObuParser::HasData() const { return size_ > 0; } StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { @@ -2652,7 +2657,6 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { // Clear everything except the sequence header. obu_headers_.clear(); frame_header_ = {}; - metadata_ = {}; tile_buffers_.clear(); next_tile_group_start_ = 0; sequence_header_changed_ = false; diff --git a/src/obu_parser.h b/src/obu_parser.h index 3f452ef..eba3370 100644 --- a/src/obu_parser.h +++ b/src/obu_parser.h @@ -221,26 +221,6 @@ enum MetadataType : uint8_t { // 32 and greater are reserved for AOM use. }; -struct ObuMetadata { - // Maximum content light level. - uint16_t max_cll; - // Maximum frame-average light level. 
- uint16_t max_fall; - uint16_t primary_chromaticity_x[3]; - uint16_t primary_chromaticity_y[3]; - uint16_t white_point_chromaticity_x; - uint16_t white_point_chromaticity_y; - uint32_t luminance_max; - uint32_t luminance_min; - // ITU-T T.35. - uint8_t itu_t_t35_country_code; - uint8_t itu_t_t35_country_code_extension_byte; // Valid if - // itu_t_t35_country_code is - // 0xFF. - std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes; - size_t itu_t_t35_payload_size; -}; - class ObuParser : public Allocable { public: ObuParser(const uint8_t* const data, size_t size, int operating_point, @@ -276,7 +256,6 @@ class ObuParser : public Allocable { const ObuSequenceHeader& sequence_header() const { return sequence_header_; } const ObuFrameHeader& frame_header() const { return frame_header_; } const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; } - const ObuMetadata& metadata() const { return metadata_; } // Returns true if the last call to ParseOneFrame() encountered a sequence // header change. bool sequence_header_changed() const { return sequence_header_changed_; } @@ -372,6 +351,11 @@ class ObuParser : public Allocable { size_t tg_header_size, size_t bytes_consumed_so_far); bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1. + // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is + // nullptr. Does not do anything otherwise. Returns true on success, false + // otherwise. + bool EnsureCurrentFrameIsNotNull(); + // Parser elements. std::unique_ptr<RawBitReader> bit_reader_; const uint8_t* data_; @@ -383,7 +367,6 @@ class ObuParser : public Allocable { ObuSequenceHeader sequence_header_ = {}; ObuFrameHeader frame_header_ = {}; Vector<TileBuffer> tile_buffers_; - ObuMetadata metadata_ = {}; // The expected starting tile number of the next Tile Group. int next_tile_group_start_ = 0; // If true, the sequence_header_ field is valid. 
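With the parser-level ObuMetadata struct removed above, HDR CLL/MDCV and ITU-T T.35 metadata now travel with each decoded frame through the new Libgav1DecoderBuffer fields shown earlier. A hedged consumer-side sketch of reading them; the function name is illustrative and the include path may differ per install:

#include <cinttypes>
#include <cstdio>

#include "gav1/decoder_buffer.h"  // installed header path; may vary per build

// Prints whichever of the new per-frame metadata blocks were parsed; each
// has_* flag is nonzero only when the corresponding metadata OBU was present.
void PrintFrameMetadata(const Libgav1DecoderBuffer& buffer) {
  if (buffer.has_hdr_cll != 0) {
    std::printf("CLL: max_cll=%d max_fall=%d\n", buffer.hdr_cll.max_cll,
                buffer.hdr_cll.max_fall);
  }
  if (buffer.has_hdr_mdcv != 0) {
    std::printf("MDCV: luminance [%" PRIu32 ", %" PRIu32 "]\n",
                buffer.hdr_mdcv.luminance_min, buffer.hdr_mdcv.luminance_max);
  }
  if (buffer.has_itut_t35 != 0) {
    std::printf("ITU-T T.35: country_code=0x%02X payload_size=%d\n",
                static_cast<unsigned>(buffer.itut_t35.country_code),
                buffer.itut_t35.payload_size);
  }
}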
diff --git a/src/obu_parser_test.cc b/src/obu_parser_test.cc index 6397ad0..a471037 100644 --- a/src/obu_parser_test.cc +++ b/src/obu_parser_test.cc @@ -31,6 +31,7 @@ #include "src/gav1/status_code.h" #include "src/utils/common.h" #include "src/utils/constants.h" +#include "src/utils/dynamic_buffer.h" #include "src/utils/segmentation.h" #include "src/utils/types.h" #include "src/utils/vector.h" @@ -780,39 +781,38 @@ class ObuParserTest : public testing::Test { OBU_TEST_COMPARE(film_grain_params_present); } - void VerifyMetadata(MetadataType type, const ObuMetadata& expected) { - const ObuMetadata& actual = obu_->metadata(); - switch (type) { - case kMetadataTypeHdrContentLightLevel: - OBU_TEST_COMPARE(max_cll); - OBU_TEST_COMPARE(max_fall); - break; - case kMetadataTypeHdrMasteringDisplayColorVolume: - for (int i = 0; i < 3; ++i) { - OBU_TEST_COMPARE(primary_chromaticity_x[i]); - OBU_TEST_COMPARE(primary_chromaticity_y[i]); - } - OBU_TEST_COMPARE(white_point_chromaticity_x); - OBU_TEST_COMPARE(white_point_chromaticity_y); - OBU_TEST_COMPARE(luminance_max); - OBU_TEST_COMPARE(luminance_min); - break; - case kMetadataTypeScalability: - break; - case kMetadataTypeItutT35: - OBU_TEST_COMPARE(itu_t_t35_country_code); - OBU_TEST_COMPARE(itu_t_t35_country_code_extension_byte); - ASSERT_EQ(expected.itu_t_t35_payload_size, - actual.itu_t_t35_payload_size); - if (actual.itu_t_t35_payload_size != 0) { - EXPECT_EQ(memcmp(expected.itu_t_t35_payload_bytes.get(), - actual.itu_t_t35_payload_bytes.get(), - actual.itu_t_t35_payload_size), - 0); - } - break; - case kMetadataTypeTimecode: - break; + void VerifyMetadataHdrCll(const ObuMetadataHdrCll& expected) { + EXPECT_TRUE(obu_->current_frame_->hdr_cll_set()); + const ObuMetadataHdrCll& actual = obu_->current_frame_->hdr_cll(); + OBU_TEST_COMPARE(max_cll); + OBU_TEST_COMPARE(max_fall); + } + + void VerifyMetadataHdrMdcv(const ObuMetadataHdrMdcv& expected) { + EXPECT_TRUE(obu_->current_frame_->hdr_mdcv_set()); + const ObuMetadataHdrMdcv& actual = obu_->current_frame_->hdr_mdcv(); + for (int i = 0; i < 3; ++i) { + OBU_TEST_COMPARE(primary_chromaticity_x[i]); + OBU_TEST_COMPARE(primary_chromaticity_y[i]); + } + OBU_TEST_COMPARE(white_point_chromaticity_x); + OBU_TEST_COMPARE(white_point_chromaticity_y); + OBU_TEST_COMPARE(luminance_max); + OBU_TEST_COMPARE(luminance_min); + } + + void VerifyMetadataItutT35(const ObuMetadataItutT35& expected) { + EXPECT_TRUE(obu_->current_frame_->itut_t35_set()); + const ObuMetadataItutT35& actual = obu_->current_frame_->itut_t35(); + OBU_TEST_COMPARE(country_code); + if (actual.country_code == 0xFF) { + OBU_TEST_COMPARE(country_code_extension_byte); + } + ASSERT_EQ(expected.payload_size, actual.payload_size); + if (actual.payload_size != 0) { + EXPECT_EQ(memcmp(expected.payload_bytes, actual.payload_bytes, + actual.payload_size), + 0); } } @@ -2521,9 +2521,9 @@ TEST_F(ObuParserTest, MetadataUnknownType) { ASSERT_TRUE(ParseMetadata(data.GenerateData())); } -TEST_F(ObuParserTest, MetadataCll) { +TEST_F(ObuParserTest, MetadataHdrCll) { BytesAndBits data; - ObuMetadata gold; + ObuMetadataHdrCll gold; gold.max_cll = 25; gold.max_fall = 100; @@ -2532,12 +2532,12 @@ TEST_F(ObuParserTest, MetadataCll) { data.AppendLiteral(16, gold.max_fall); ASSERT_TRUE(ParseMetadata(data.GenerateData())); - VerifyMetadata(kMetadataTypeHdrContentLightLevel, gold); + VerifyMetadataHdrCll(gold); } -TEST_F(ObuParserTest, MetadataMdcv) { +TEST_F(ObuParserTest, MetadataHdrMdcv) { BytesAndBits data; - ObuMetadata gold; + ObuMetadataHdrMdcv gold; for (int i 
= 0; i < 3; ++i) { gold.primary_chromaticity_x[i] = 0; gold.primary_chromaticity_y[i] = 0; @@ -2558,34 +2558,32 @@ TEST_F(ObuParserTest, MetadataMdcv) { data.AppendLiteral(32, gold.luminance_min); ASSERT_TRUE(ParseMetadata(data.GenerateData())); - VerifyMetadata(kMetadataTypeHdrMasteringDisplayColorVolume, gold); + VerifyMetadataHdrMdcv(gold); } TEST_F(ObuParserTest, MetadataScalability) { BytesAndBits data; - ObuMetadata gold; data.AppendLiteral(8, kMetadataTypeScalability); data.AppendLiteral(8, 0); // scalability_mode_idc ASSERT_TRUE(ParseMetadata(data.GenerateData())); - VerifyMetadata(kMetadataTypeScalability, gold); } TEST_F(ObuParserTest, MetadataItutT35) { BytesAndBits data; - ObuMetadata gold; - gold.itu_t_t35_country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland - gold.itu_t_t35_country_code_extension_byte = 0; - gold.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[10]); - ASSERT_NE(gold.itu_t_t35_payload_bytes, nullptr); + ObuMetadataItutT35 gold; + gold.country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland + DynamicBuffer<uint8_t> payload_bytes; + ASSERT_TRUE(payload_bytes.Resize(10)); + gold.payload_bytes = payload_bytes.get(); for (int i = 0; i < 10; ++i) { - gold.itu_t_t35_payload_bytes[i] = 9 - i; + gold.payload_bytes[i] = 9 - i; } - gold.itu_t_t35_payload_size = 10; + gold.payload_size = 10; data.AppendLiteral(8, kMetadataTypeItutT35); - data.AppendLiteral(8, gold.itu_t_t35_country_code); + data.AppendLiteral(8, gold.country_code); for (int i = 0; i < 10; ++i) { data.AppendLiteral(8, 9 - i); } @@ -2596,12 +2594,20 @@ TEST_F(ObuParserTest, MetadataItutT35) { data.AppendLiteral(8, 0x00); ASSERT_TRUE(ParseMetadata(data.GenerateData())); - VerifyMetadata(kMetadataTypeItutT35, gold); + VerifyMetadataItutT35(gold); + + gold.country_code = 0xFF; + gold.country_code_extension_byte = 10; + + data.SetLiteral(8, 8, gold.country_code); + data.InsertLiteral(16, 8, gold.country_code_extension_byte); + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadataItutT35(gold); } TEST_F(ObuParserTest, MetadataTimecode) { BytesAndBits data; - ObuMetadata gold; data.AppendLiteral(8, kMetadataTypeTimecode); data.AppendLiteral(5, 0); // counting_type @@ -2615,12 +2621,10 @@ TEST_F(ObuParserTest, MetadataTimecode) { data.AppendLiteral(5, 0); // time_offset_length ASSERT_TRUE(ParseMetadata(data.GenerateData())); - VerifyMetadata(kMetadataTypeTimecode, gold); } TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) { BytesAndBits data; - ObuMetadata gold; data.AppendLiteral(8, kMetadataTypeTimecode); data.AppendLiteral(5, 0); // counting_type @@ -2638,7 +2642,6 @@ TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) { TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) { BytesAndBits data; - ObuMetadata gold; data.AppendLiteral(8, kMetadataTypeTimecode); data.AppendLiteral(5, 0); // counting_type @@ -2656,7 +2659,6 @@ TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) { TEST_F(ObuParserTest, MetadataTimecodeInvalidHoursValue) { BytesAndBits data; - ObuMetadata gold; data.AppendLiteral(8, kMetadataTypeTimecode); data.AppendLiteral(5, 0); // counting_type diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc index 48ad823..daee01c 100644 --- a/src/post_filter/deblock.cc +++ b/src/post_filter/deblock.cc @@ -329,7 +329,6 @@ void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end, src_row, src_stride, outer_thresh_[level], inner_thresh_[level], HevThresh(level)); } - // TODO(chengchen): use shifts instead of multiplication. 
src_row += row_step * src_stride; row_step = DivideBy4(row_step); } diff --git a/src/post_filter_test.cc b/src/post_filter_test.cc index db9d0f4..034d31f 100644 --- a/src/post_filter_test.cc +++ b/src/post_filter_test.cc @@ -141,6 +141,45 @@ const char* GetSuperResDigest10bpp(int id, int plane) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetSuperResDigest12bpp(int id, int plane) { + // Digests are in Y/U/V order. + static const char* const kDigestSuperRes[][kMaxPlanes] = { + { + // all input is 0. + "fccb1f57b252b1a86d335aea929d1d58", + "2f244a56091c9705794e92e6bcc38058", + "2f244a56091c9705794e92e6bcc38058", + }, + { + // all input is 1. + "de8556204999d6e4bf74cfdde61a095b", + "e7d0f4ce6df81c46de95da7790a67384", + "e7d0f4ce6df81c46de95da7790a67384", + }, + { + // all input is 2048. + "83d600a7b3dc9bc3f710668ee2244e6b", + "468eec1453edc1befeb8a346f61950a7", + "468eec1453edc1befeb8a346f61950a7", + }, + { + // all input is 4095. + "30bdb1dfee2b02b12b38e6b9f6287e27", + "34d673f075d2caa93a2f648ee3569e20", + "34d673f075d2caa93a2f648ee3569e20", + }, + { + // random input. + "f10f21f5322231d991550fce7ef9787d", + "a2d8b6140bd5002e86644ef433b8eb42", + "a2d8b6140bd5002e86644ef433b8eb42", + }, + }; + return kDigestSuperRes[id][plane]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace // This type is used to parameterize the tests so is defined outside the @@ -175,6 +214,7 @@ static std::ostream& operator<<(std::ostream& os, const FrameSizeParam& param) { template <int bitdepth, typename Pixel> class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); PostFilterTestBase() = default; PostFilterTestBase(const PostFilterTestBase&) = delete; PostFilterTestBase& operator=(const PostFilterTestBase&) = delete; @@ -231,6 +271,7 @@ class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> { template <int bitdepth, typename Pixel> class PostFilterHelperFuncTest : public PostFilterTestBase<bitdepth, Pixel> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); PostFilterHelperFuncTest() = default; PostFilterHelperFuncTest(const PostFilterHelperFuncTest&) = delete; PostFilterHelperFuncTest& operator=(const PostFilterHelperFuncTest&) = delete; @@ -425,6 +466,7 @@ void PostFilterHelperFuncTest<bitdepth, Pixel>::TestExtendFrame( template <int bitdepth, typename Pixel> class PostFilterSuperResTest : public PostFilterTestBase<bitdepth, Pixel> { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); PostFilterSuperResTest() { test_utils::ResetDspTable(bitdepth); dsp::SuperResInit_C(); @@ -581,6 +623,11 @@ void PostFilterSuperResTest<bitdepth, Pixel>::TestApplySuperRes( expected_digest = GetSuperResDigest10bpp(id, plane); break; #endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetSuperResDigest12bpp(id, plane); + break; +#endif } ASSERT_NE(expected_digest, nullptr); EXPECT_STREQ(digest.c_str(), expected_digest); @@ -680,6 +727,44 @@ INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance, testing::ValuesIn(kTestParamExtendFrame)); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +using PostFilterSuperResTest12bpp = PostFilterSuperResTest<12, uint16_t>; + +TEST_P(PostFilterSuperResTest12bpp, ApplySuperRes) { + TestApplySuperRes(true, 0, 0, false); + TestApplySuperRes(true, 1, 1, false); + TestApplySuperRes(true, 1 << 11, 2, false); + 
TestApplySuperRes(true, (1 << 12) - 1, 3, false); + TestApplySuperRes(false, 0, 4, false); +} + +TEST_P(PostFilterSuperResTest12bpp, ApplySuperResThreaded) { + TestApplySuperRes(true, 0, 0, true); + TestApplySuperRes(true, 1, 1, true); + TestApplySuperRes(true, 1 << 11, 2, true); + TestApplySuperRes(true, (1 << 12) - 1, 3, true); + TestApplySuperRes(false, 0, 4, true); +} + +INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance, + PostFilterSuperResTest12bpp, + testing::ValuesIn(kTestParamSuperRes)); + +using PostFilterHelperFuncTest12bpp = PostFilterHelperFuncTest<12, uint16_t>; + +TEST_P(PostFilterHelperFuncTest12bpp, ExtendFrame) { + TestExtendFrame(true, 0); + TestExtendFrame(true, 1); + TestExtendFrame(true, 255); + TestExtendFrame(true, (1 << 12) - 1); + TestExtendFrame(false, 0); +} + +INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance, + PostFilterHelperFuncTest12bpp, + testing::ValuesIn(kTestParamExtendFrame)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + namespace { const char* GetDigestApplyCdef8bpp(int id) { @@ -712,12 +797,29 @@ const char* GetDigestApplyCdef10bpp(int id) { } #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 +const char* GetDigestApplyCdef12bpp(int id) { + static const char* const kDigest[] = { + "06e2d09b6ce3924f3b5d4c00ab76eea5", "287240e4b13cb75e17932a3dd7ba3b3c", + "265da123e3347c4fb3e434f26a3949e7", "e032ce6eb76242df6894482ac6688406", + "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a", + "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b", + "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a", + "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b", + "155dd4283f8037f86cce34b6cfe67a7e", "0a022c70ead199517af9bad2002d70cd", + "a966dfea52a7a2084545f68b2c9e1735", "e098438a23a7c9f276e594b98b2db922", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace template <int bitdepth, typename Pixel> class PostFilterApplyCdefTest : public testing::TestWithParam<FrameSizeParam>, public test_utils::MaxAlignedAllocable { public: + static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, ""); PostFilterApplyCdefTest() = default; PostFilterApplyCdefTest(const PostFilterApplyCdefTest&) = delete; PostFilterApplyCdefTest& operator=(const PostFilterApplyCdefTest&) = delete; @@ -903,17 +1005,25 @@ void PostFilterApplyCdefTest<bitdepth, Pixel>::TestMultiThread( elapsed_time += absl::Now() - start; CopyFilterOutputToDestBuffer(); - if (bitdepth == 8) { - test_utils::CheckMd5Digest(kCdef, kApplyCdefName, - GetDigestApplyCdef8bpp(id), dest_, size_, - elapsed_time); + const char* expected_digest = nullptr; + switch (bitdepth) { + case 8: + expected_digest = GetDigestApplyCdef8bpp(id); + break; #if LIBGAV1_MAX_BITDEPTH >= 10 - } else { - test_utils::CheckMd5Digest(kCdef, kApplyCdefName, - GetDigestApplyCdef10bpp(id), dest_, size_, - elapsed_time); -#endif // LIBGAV1_MAX_BITDEPTH >= 10 + case 10: + expected_digest = GetDigestApplyCdef10bpp(id); + break; +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: + expected_digest = GetDigestApplyCdef12bpp(id); + break; +#endif } + ASSERT_NE(expected_digest, nullptr); + test_utils::CheckMd5Digest(kCdef, kApplyCdefName, expected_digest, dest_, + size_, elapsed_time); } const FrameSizeParam kTestParamApplyCdef[] = { @@ -953,4 +1063,18 @@ INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance, testing::ValuesIn(kTestParamApplyCdef)); #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 
+using PostFilterApplyCdefTest12bpp = PostFilterApplyCdefTest<12, uint16_t>; + +TEST_P(PostFilterApplyCdefTest12bpp, ApplyCdef) { + TestMultiThread(2); + TestMultiThread(4); + TestMultiThread(8); +} + +INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance, + PostFilterApplyCdefTest12bpp, + testing::ValuesIn(kTestParamApplyCdef)); +#endif // LIBGAV1_MAX_BITDEPTH == 12 + } // namespace libgav1 diff --git a/src/quantizer.cc b/src/quantizer.cc index cd720d6..eb13314 100644 --- a/src/quantizer.cc +++ b/src/quantizer.cc @@ -20,8 +20,9 @@ #include "src/utils/common.h" #include "src/utils/constants.h" -#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 -#error LIBGAV1_MAX_BITDEPTH must be 8 or 10 +#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \ + LIBGAV1_MAX_BITDEPTH != 12 +#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12 #endif namespace libgav1 { @@ -87,6 +88,43 @@ constexpr int16_t kDcLookup[][256] = { 4737, 4929, 5130, 5347 }, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + // Lookup table for 12 bit. + { + 4, 12, 18, 25, 33, 41, 50, 60, + 70, 80, 91, 103, 115, 127, 140, 153, + 166, 180, 194, 208, 222, 237, 251, 266, + 281, 296, 312, 327, 343, 358, 374, 390, + 405, 421, 437, 453, 469, 484, 500, 516, + 532, 548, 564, 580, 596, 611, 627, 643, + 659, 674, 690, 706, 721, 737, 752, 768, + 783, 798, 814, 829, 844, 859, 874, 889, + 904, 919, 934, 949, 964, 978, 993, 1008, + 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, + 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, + 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, + 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, + 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, + 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, + 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, + 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, + 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, + 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, + 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, + 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, + 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, + 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, + 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, + 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, + 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, + 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, + 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, + 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387 + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 }; constexpr int16_t kAcLookup[][256] = { @@ -142,6 +180,43 @@ constexpr int16_t kAcLookup[][256] = { 6900, 7036, 7172, 7312 }, #endif // LIBGAV1_MAX_BITDEPTH >= 10 +#if LIBGAV1_MAX_BITDEPTH == 12 + // Lookup table for 12 bit. 
+ { + 4, 13, 19, 27, 35, 44, 54, 64, + 75, 87, 99, 112, 126, 139, 154, 168, + 183, 199, 214, 230, 247, 263, 280, 297, + 314, 331, 349, 366, 384, 402, 420, 438, + 456, 475, 493, 511, 530, 548, 567, 586, + 604, 623, 642, 660, 679, 698, 716, 735, + 753, 772, 791, 809, 828, 846, 865, 884, + 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, + 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, + 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, + 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, + 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, + 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, + 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, + 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, + 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, + 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, + 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, + 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, + 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, + 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, + 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, + 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, + 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, + 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, + 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, + 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, + 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247 + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 }; // clang-format on diff --git a/src/quantizer_test.cc b/src/quantizer_test.cc index 618d247..0c27027 100644 --- a/src/quantizer_test.cc +++ b/src/quantizer_test.cc @@ -106,6 +106,32 @@ TEST(QuantizerTest, GetDcValue) { EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 5347); } #endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 + // Test lookups of Dc_Qlookup[2][0], Dc_Qlookup[2][11], Dc_Qlookup[2][12], + // and Dc_Qlookup[2][255] in the spec, including the clipping of qindex. + { + Quantizer quantizer(12, &params); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 103); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 115); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 21387); + EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 21387); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 103); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 115); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 21387); + EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 21387); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 103); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 115); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 254), 21387); + EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 21387); + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 } TEST(QuantizerTest, GetAcValue) { @@ -162,6 +188,32 @@ TEST(QuantizerTest, GetAcValue) { EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 7312); } #endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#if LIBGAV1_MAX_BITDEPTH == 12 + // Test lookups of Ac_Qlookup[1][0], Ac_Qlookup[1][11], Ac_Qlookup[1][12], + // and Ac_Qlookup[1][255] in the spec, including the clipping of qindex. 
+ { + Quantizer quantizer(12, &params); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 112); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 126); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 29247); + EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 29247); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 112); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 126); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 29247); + EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 29247); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 112); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 126); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 29247); + EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 29247); + } +#endif // LIBGAV1_MAX_BITDEPTH == 12 } } // namespace diff --git a/src/threading_strategy_test.cc b/src/threading_strategy_test.cc index 2a7a781..beea36f 100644 --- a/src/threading_strategy_test.cc +++ b/src/threading_strategy_test.cc @@ -99,7 +99,14 @@ TEST_F(ThreadingStrategyTest, MultipleCalls) { ASSERT_TRUE(strategy_.Reset(frame_header_, 16)); EXPECT_NE(strategy_.tile_thread_pool(), nullptr); for (int i = 0; i < 8; ++i) { - EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + // See ThreadingStrategy::Reset(). +#if defined(__ANDROID__) + if (i >= 4) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i; + continue; + } +#endif + EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i; } EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); @@ -120,11 +127,18 @@ TEST_F(ThreadingStrategyTest, MultipleCalls) { EXPECT_NE(strategy_.tile_thread_pool(), nullptr); // First two tiles will get 1 thread each. for (int i = 0; i < 2; ++i) { - EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + // See ThreadingStrategy::Reset(). +#if defined(__ANDROID__) + if (i == 1) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i; + continue; + } +#endif + EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i; } // All the other row threads must be reset. for (int i = 2; i < 8; ++i) { - EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i; } EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); @@ -153,6 +167,13 @@ TEST_F(ThreadingStrategyTest, MultipleCalls2) { ASSERT_TRUE(strategy_.Reset(frame_header_, 4)); EXPECT_NE(strategy_.tile_thread_pool(), nullptr); for (int i = 0; i < 2; ++i) { + // See ThreadingStrategy::Reset(). +#if defined(__ANDROID__) + if (i == 1) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i; + continue; + } +#endif EXPECT_NE(strategy_.row_thread_pool(i), nullptr); } for (int i = 2; i < 8; ++i) { @@ -464,13 +464,14 @@ class Tile : public MaxAlignedAllocable { int* start_y, int* step_x, int* step_y); // 7.11.3.3. // If the method returns false, the caller only uses the output parameters // *ref_block_start_x and *ref_block_start_y. If the method returns true, the - // caller uses all three output parameters. + // caller uses all four output parameters. 
static bool GetReferenceBlockPosition( int reference_frame_index, bool is_scaled, int width, int height, int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y, int start_x, int start_y, int step_x, int step_y, int left_border, int right_border, int top_border, int bottom_border, - int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x); + int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x, + int* ref_block_end_y); template <typename Pixel> void BuildConvolveBlock(Plane plane, int reference_frame_index, diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc index bba5a69..4348548 100644 --- a/src/tile/prediction.cc +++ b/src/tile/prediction.cc @@ -771,11 +771,10 @@ bool Tile::InterPrediction(const Block& block, const Plane plane, const int x, [static_cast<int>(prediction_parameters.mask_is_inverse)]( block.scratch_buffer->prediction_buffer[0], block.scratch_buffer->prediction_buffer[1], - block.scratch_buffer->weight_mask, - kMaxSuperBlockSizeInPixels); + block.scratch_buffer->weight_mask, block.width); } prediction_mask = block.scratch_buffer->weight_mask; - prediction_mask_stride = kMaxSuperBlockSizeInPixels; + prediction_mask_stride = block.width; } if (is_compound) { @@ -996,7 +995,7 @@ bool Tile::GetReferenceBlockPosition( const int start_y, const int step_x, const int step_y, const int left_border, const int right_border, const int top_border, const int bottom_border, int* ref_block_start_x, int* ref_block_start_y, - int* ref_block_end_x) { + int* ref_block_end_x, int* ref_block_end_y) { *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0); *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0); if (reference_frame_index == -1) { @@ -1006,7 +1005,7 @@ bool Tile::GetReferenceBlockPosition( *ref_block_start_y -= kConvolveBorderLeftTop; *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) + kConvolveBorderRight; - int ref_block_end_y = + *ref_block_end_y = GetPixelPositionFromHighScale(start_y, step_y, height - 1) + kConvolveBorderBottom; if (is_scaled) { @@ -1015,13 +1014,13 @@ bool Tile::GetReferenceBlockPosition( kScaleSubPixelBits) + kSubPixelTaps; *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight; - ref_block_end_y = *ref_block_start_y + block_height - 1; + *ref_block_end_y = *ref_block_start_y + block_height - 1; } // Determines if we need to extend beyond the left/right/top/bottom border. return *ref_block_start_x < (ref_start_x - left_border) || *ref_block_end_x > (ref_last_x + right_border) || *ref_block_start_y < (ref_start_y - top_border) || - ref_block_end_y > (ref_last_y + bottom_border); + *ref_block_end_y > (ref_last_y + bottom_border); } // Builds a block as the input for convolve, by copying the content of @@ -1140,6 +1139,7 @@ bool Tile::BlockInterPrediction( int ref_block_start_x; int ref_block_start_y; int ref_block_end_x; + int ref_block_end_y; const bool extend_block = GetReferenceBlockPosition( reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x, ref_start_y, ref_last_y, start_x, start_y, step_x, step_y, @@ -1147,24 +1147,15 @@ bool Tile::BlockInterPrediction( reference_buffer->right_border(plane), reference_buffer->top_border(plane), reference_buffer->bottom_border(plane), &ref_block_start_x, - &ref_block_start_y, &ref_block_end_x); + &ref_block_start_y, &ref_block_end_x, &ref_block_end_y); // In frame parallel mode, ensure that the reference block has been decoded // and available for referencing. 
if (reference_frame_index != -1 && frame_parallel_) { - int reference_y_max; - if (is_scaled) { - // TODO(vigneshv): For now, we wait for the entire reference frame to be - // decoded if we are using scaled references. This will eventually be - // fixed. - reference_y_max = reference_height; - } else { - reference_y_max = - std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y); - // For U and V planes with subsampling, we need to multiply - // reference_y_max by 2 since we only track the progress of Y planes. - reference_y_max = LeftShift(reference_y_max, subsampling_y); - } + // For U and V planes with subsampling, we need to multiply the value of + // ref_block_end_y by 2 since we only track the progress of the Y planes. + const int reference_y_max = LeftShift( + std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y); if (reference_frame_progress_cache_[reference_frame_index] < reference_y_max && !reference_frames_[reference_frame_index]->WaitUntil( @@ -1297,11 +1288,12 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane, start_x += 8) { const int src_x = (start_x + 4) << subsampling_x_[plane]; const int src_y = (start_y + 4) << subsampling_y_[plane]; - const int dst_y = src_x * warp_params->params[4] + - src_y * warp_params->params[5] + - warp_params->params[1]; - const int y4 = dst_y >> subsampling_y_[plane]; - const int iy4 = y4 >> kWarpedModelPrecisionBits; + const int64_t dst_y = + src_x * warp_params->params[4] + + static_cast<int64_t>(src_y) * warp_params->params[5] + + warp_params->params[1]; + const int64_t y4 = dst_y >> subsampling_y_[plane]; + const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits); reference_y_max = std::max(iy4 + 8, reference_y_max); } } diff --git a/src/utils/constants.h b/src/utils/constants.h index 1126ad6..8281aad 100644 --- a/src/utils/constants.h +++ b/src/utils/constants.h @@ -37,6 +37,10 @@ enum { }; // anonymous enum enum { + // Documentation variables. + kBitdepth8 = 8, + kBitdepth10 = 10, + kBitdepth12 = 12, kInvalidMvValue = -32768, kCdfMaxProbability = 32768, kBlockWidthCount = 5, @@ -59,6 +63,13 @@ enum { kRestorationTypeSymbolCount = 3, kSgrProjParamsBits = 4, kSgrProjPrecisionBits = 7, + // Precision of a division table (mtable) + kSgrProjScaleBits = 20, + kSgrProjReciprocalBits = 12, + // Core self-guided restoration precision bits. + kSgrProjSgrBits = 8, + // Precision bits of generated values higher than source before projection. + kSgrProjRestoreBits = 4, // Padding on left and right side of a restoration block. // 3 is enough, but padding to 4 is more efficient, and makes the temporary // source buffer 8-pixel aligned. @@ -177,6 +188,15 @@ enum { // On Linux, the cache line size can be looked up with the command: // getconf LEVEL1_DCACHE_LINESIZE kCacheLineSize = 64, + // InterRound0, Section 7.11.3.2. + kInterRoundBitsHorizontal = 3, // 8 & 10-bit. + kInterRoundBitsHorizontal12bpp = 5, + kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction. + kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction. + kInterRoundBitsVertical12bpp = 9, + // Offset applied to 10bpp and 12bpp predictors to allow storing them in + // uint16_t. Removed before blending. 
+ kCompoundOffset = (1 << 14) + (1 << 13), }; // anonymous enum enum FrameType : uint8_t { diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc index 4284ca2..bbf40c3 100644 --- a/src/utils/segmentation_map.cc +++ b/src/utils/segmentation_map.cc @@ -21,9 +21,12 @@ namespace libgav1 { bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) { + if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) { + segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]); + } + rows4x4_ = rows4x4; columns4x4_ = columns4x4; - segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]); if (segment_id_buffer_ == nullptr) return false; segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get()); return true; diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc index 69b40e8..0da8a1f 100644 --- a/src/warp_prediction.cc +++ b/src/warp_prediction.cc @@ -231,9 +231,6 @@ bool WarpEstimation(const int num_samples, const int block_width4x4, Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1); params[1] = Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1); - - params[6] = 0; - params[7] = 0; return true; } diff --git a/tests/block_utils.cc b/tests/block_utils.cc index 07337c4..a68ae64 100644 --- a/tests/block_utils.cc +++ b/tests/block_utils.cc @@ -55,7 +55,6 @@ void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width, block2 += stride2; } } -#undef LIBGAV1_DEBUG_FORMAT_CODE } // namespace @@ -68,15 +67,16 @@ void PrintBlock(const Pixel* block, int width, int height, int stride, printf("[%2d] ", y); for (int x = 0; x < print_width; ++x) { if (x >= width) { - printf("[%*d] ", field_width, block[x]); + printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block[x]); } else { - printf("%*d ", field_width, block[x]); + printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block[x]); } } printf("\n"); block += stride; } } +#undef LIBGAV1_DEBUG_FORMAT_CODE template void PrintBlock(const uint8_t* block, int width, int height, int stride, bool print_padding /*= false*/); diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake index 2b3f41c..c759d4f 100644 --- a/tests/libgav1_tests.cmake +++ b/tests/libgav1_tests.cmake @@ -96,9 +96,13 @@ list(APPEND libgav1_common_sse4_test_sources list(APPEND libgav1_convolve_test_sources "${libgav1_source}/dsp/convolve_test.cc") list(APPEND libgav1_cpu_test_sources "${libgav1_source}/utils/cpu_test.cc") -list(APPEND libgav1_c_decoder_test_sources "${libgav1_source}/c_decoder_test.c") +list(APPEND libgav1_c_decoder_test_sources + "${libgav1_source}/c_decoder_test.c" + "${libgav1_source}/decoder_test_data.h") list(APPEND libgav1_c_version_test_sources "${libgav1_source}/c_version_test.c") -list(APPEND libgav1_decoder_test_sources "${libgav1_source}/decoder_test.cc") +list(APPEND libgav1_decoder_test_sources + "${libgav1_source}/decoder_test.cc" + "${libgav1_source}/decoder_test_data.h") list(APPEND libgav1_decoder_buffer_test_sources "${libgav1_source}/decoder_buffer_test.cc") list(APPEND libgav1_distance_weighted_blend_test_sources @@ -217,18 +221,6 @@ macro(libgav1_add_tests_targets) ${libgav1_gtest_include_paths} ${libgav1_include_paths}) - if(ANDROID OR IOS) - if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX - AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) - set(use_absl_threading TRUE) - endif() - elseif(NOT - (DEFINED - LIBGAV1_THREADPOOL_USE_STD_MUTEX - AND LIBGAV1_THREADPOOL_USE_STD_MUTEX)) - set(use_absl_threading TRUE) - endif() - 
if(use_absl_threading) list(APPEND libgav1_common_test_absl_deps absl::synchronization) endif() diff --git a/tests/utils.h b/tests/utils.h index 4d73070..3394d64 100644 --- a/tests/utils.h +++ b/tests/utils.h @@ -25,6 +25,7 @@ #include "absl/strings/string_view.h" #include "absl/time/time.h" #include "src/gav1/decoder_buffer.h" +#include "src/utils/compiler_attributes.h" #include "src/utils/memory.h" #include "tests/third_party/libvpx/acm_random.h" @@ -42,9 +43,22 @@ static_assert(kAlternateDeterministicSeed != // Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions // of new to support googletest allocations. +// Note when building the source as C++17 or greater, gcc 11.2.0 may issue a +// warning of the form: +// warning: 'void operator delete [](void*, std::align_val_t)' called on +// pointer returned from a mismatched allocation function +// note: returned from 'static void* +// libgav1::test_utils::MaxAlignedAllocable::operator new [](size_t)' +// This is a false positive as this function calls +// libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow) which in +// turn calls +// void* operator new[](std::size_t, std::align_val_t, const std::nothrow_t&). +// This is due to unbalanced inlining of the functions, so we force them to be +// inlined. +// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103993 struct MaxAlignedAllocable { // Class-specific allocation functions. - static void* operator new(size_t size) { + static LIBGAV1_ALWAYS_INLINE void* operator new(size_t size) { void* const p = libgav1::MaxAlignedAllocable::operator new(size, std::nothrow); #ifdef ABSL_HAVE_EXCEPTIONS @@ -52,7 +66,7 @@ struct MaxAlignedAllocable { #endif return p; } - static void* operator new[](size_t size) { + static LIBGAV1_ALWAYS_INLINE void* operator new[](size_t size) { void* const p = libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow); #ifdef ABSL_HAVE_EXCEPTIONS @@ -62,29 +76,33 @@ struct MaxAlignedAllocable { } // Class-specific non-throwing allocation functions - static void* operator new(size_t size, const std::nothrow_t& tag) noexcept { + static LIBGAV1_ALWAYS_INLINE void* operator new( + size_t size, const std::nothrow_t& tag) noexcept { return libgav1::MaxAlignedAllocable::operator new(size, tag); } - static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept { + static LIBGAV1_ALWAYS_INLINE void* operator new[]( + size_t size, const std::nothrow_t& tag) noexcept { return libgav1::MaxAlignedAllocable::operator new[](size, tag); } // Class-specific deallocation functions. - static void operator delete(void* ptr) noexcept { + static LIBGAV1_ALWAYS_INLINE void operator delete(void* ptr) noexcept { libgav1::MaxAlignedAllocable::operator delete(ptr); } - static void operator delete[](void* ptr) noexcept { + static LIBGAV1_ALWAYS_INLINE void operator delete[](void* ptr) noexcept { libgav1::MaxAlignedAllocable::operator delete[](ptr); } // Only called if new (std::nothrow) is used and the constructor throws an // exception. - static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept { + static LIBGAV1_ALWAYS_INLINE void operator delete( + void* ptr, const std::nothrow_t& tag) noexcept { libgav1::MaxAlignedAllocable::operator delete(ptr, tag); } // Only called if new[] (std::nothrow) is used and the constructor throws an // exception. 
- static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept { + static LIBGAV1_ALWAYS_INLINE void operator delete[]( + void* ptr, const std::nothrow_t& tag) noexcept { libgav1::MaxAlignedAllocable::operator delete[](ptr, tag); } };
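The LIBGAV1_ALWAYS_INLINE annotations above only silence gcc 11.2.0's mismatched new/delete diagnostic; the underlying pattern is a class-specific throwing operator new that forwards to a nothrow, alignment-aware allocator. A standalone sketch of that pattern under C++17 aligned-new rules (the struct name and the max_align_t alignment are illustrative; libgav1 uses its own maximum SIMD alignment internally):

#include <cstddef>
#include <new>

struct AlignedForTestsSketch {
  // Throwing form, as googletest's allocations require: forward to the
  // nothrow over-aligned global allocator, converting failure to bad_alloc.
  static void* operator new(std::size_t size) {
    void* const p = ::operator new(
        size, std::align_val_t(alignof(std::max_align_t)), std::nothrow);
    if (p == nullptr) throw std::bad_alloc();
    return p;
  }
  // The matching aligned deallocation; keeping this pair balanced is exactly
  // the property the gcc warning discussed above tries to check.
  static void operator delete(void* ptr) noexcept {
    ::operator delete(ptr, std::align_val_t(alignof(std::max_align_t)));
  }
};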