author    Boyuan Yang <byang@debian.org>  2022-07-14 15:56:57 -0400
committer Boyuan Yang <byang@debian.org>  2022-07-14 15:56:57 -0400
commit    d4dbf19f6b0181ee78034bfe4caf189d1c016998 (patch)
tree      47d5d28d2ab770a10e6c48788725c51dffeb84a9
parent    320ef65362608ee1148c299d8d5d7618af34e470 (diff)
download  libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.gz
          libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.bz2
          libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.zip
New upstream version 0.18.0
-rw-r--r--  CMakeLists.txt | 34
-rw-r--r--  README.md | 4
-rw-r--r--  cmake/libgav1_build_definitions.cmake | 10
-rw-r--r--  cmake/libgav1_install.cmake | 6
-rw-r--r--  cmake/toolchains/android.cmake | 6
-rw-r--r--  cmake/toolchains/arm-linux-gnueabihf.cmake | 7
-rw-r--r--  examples/libgav1_examples.cmake | 7
-rw-r--r--  src/buffer_pool.cc | 10
-rw-r--r--  src/buffer_pool.h | 39
-rw-r--r--  src/c_decoder_test.c | 104
-rw-r--r--  src/decoder_impl.cc | 44
-rw-r--r--  src/decoder_impl.h | 5
-rw-r--r--  src/decoder_test.cc | 95
-rw-r--r--  src/decoder_test_data.h | 65
-rw-r--r--  src/dsp/arm/common_neon.h | 52
-rw-r--r--  src/dsp/arm/convolve_10bit_neon.cc | 224
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc | 105
-rw-r--r--  src/dsp/arm/film_grain_neon.cc | 218
-rw-r--r--  src/dsp/arm/film_grain_neon.h | 4
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc | 688
-rw-r--r--  src/dsp/arm/intrapred_neon.cc | 10
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc | 339
-rw-r--r--  src/dsp/arm/inverse_transform_10bit_neon.cc | 28
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc | 146
-rw-r--r--  src/dsp/arm/loop_filter_10bit_neon.cc | 1218
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc | 1298
-rw-r--r--  src/dsp/arm/loop_filter_neon.h | 1
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc | 8
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc | 375
-rw-r--r--  src/dsp/arm/obmc_neon.cc | 523
-rw-r--r--  src/dsp/arm/warp_neon.cc | 97
-rw-r--r--  src/dsp/average_blend.cc | 18
-rw-r--r--  src/dsp/average_blend_test.cc | 55
-rw-r--r--  src/dsp/cdef.cc | 67
-rw-r--r--  src/dsp/cdef.h | 5
-rw-r--r--  src/dsp/cdef_test.cc | 80
-rw-r--r--  src/dsp/constants.h | 18
-rw-r--r--  src/dsp/convolve.cc | 89
-rw-r--r--  src/dsp/convolve.h | 31
-rw-r--r--  src/dsp/convolve.inc | 31
-rw-r--r--  src/dsp/convolve_test.cc | 246
-rw-r--r--  src/dsp/distance_weighted_blend.cc | 18
-rw-r--r--  src/dsp/distance_weighted_blend_test.cc | 51
-rw-r--r--  src/dsp/dsp.cc | 7
-rw-r--r--  src/dsp/dsp_test.cc | 30
-rw-r--r--  src/dsp/film_grain.cc | 125
-rw-r--r--  src/dsp/film_grain_common.h | 8
-rw-r--r--  src/dsp/intra_edge.cc | 22
-rw-r--r--  src/dsp/intra_edge_test.cc | 46
-rw-r--r--  src/dsp/intrapred.cc | 548
-rw-r--r--  src/dsp/intrapred_cfl.cc | 260
-rw-r--r--  src/dsp/intrapred_cfl_test.cc | 247
-rw-r--r--  src/dsp/intrapred_directional.cc | 47
-rw-r--r--  src/dsp/intrapred_directional_test.cc | 185
-rw-r--r--  src/dsp/intrapred_filter.cc | 18
-rw-r--r--  src/dsp/intrapred_filter_test.cc | 133
-rw-r--r--  src/dsp/intrapred_smooth.cc | 263
-rw-r--r--  src/dsp/intrapred_smooth.h | 6
-rw-r--r--  src/dsp/intrapred_test.cc | 204
-rw-r--r--  src/dsp/inverse_transform.cc | 267
-rw-r--r--  src/dsp/inverse_transform_test.cc | 15
-rw-r--r--  src/dsp/libgav1_dsp.cmake | 1
-rw-r--r--  src/dsp/loop_filter.cc | 70
-rw-r--r--  src/dsp/loop_filter_test.cc | 83
-rw-r--r--  src/dsp/loop_restoration.cc | 25
-rw-r--r--  src/dsp/loop_restoration.h | 10
-rw-r--r--  src/dsp/loop_restoration_test.cc | 108
-rw-r--r--  src/dsp/mask_blend.cc | 46
-rw-r--r--  src/dsp/mask_blend_test.cc | 83
-rw-r--r--  src/dsp/obmc.cc | 24
-rw-r--r--  src/dsp/obmc_test.cc | 80
-rw-r--r--  src/dsp/super_res.cc | 19
-rw-r--r--  src/dsp/super_res_test.cc | 50
-rw-r--r--  src/dsp/warp.cc | 69
-rw-r--r--  src/dsp/warp.h | 30
-rw-r--r--  src/dsp/warp_test.cc | 70
-rw-r--r--  src/dsp/weight_mask.cc | 82
-rw-r--r--  src/dsp/weight_mask_test.cc | 219
-rw-r--r--  src/dsp/x86/average_blend_sse4.cc | 84
-rw-r--r--  src/dsp/x86/common_sse4_test.cc | 4
-rw-r--r--  src/dsp/x86/convolve_avx2.cc | 322
-rw-r--r--  src/dsp/x86/convolve_sse4.cc | 187
-rw-r--r--  src/dsp/x86/convolve_sse4.inc | 98
-rw-r--r--  src/dsp/x86/distance_weighted_blend_sse4.cc | 152
-rw-r--r--  src/dsp/x86/film_grain_sse4.cc | 14
-rw-r--r--  src/dsp/x86/intrapred_directional_sse4.cc | 239
-rw-r--r--  src/dsp/x86/loop_restoration_sse4.cc | 1
-rw-r--r--  src/dsp/x86/mask_blend_sse4.cc | 336
-rw-r--r--  src/dsp/x86/obmc_sse4.cc | 144
-rw-r--r--  src/dsp/x86/warp_sse4.cc | 58
-rw-r--r--  src/dsp/x86/weight_mask_sse4.cc | 360
-rw-r--r--  src/film_grain.cc | 3
-rw-r--r--  src/film_grain.h | 4
-rw-r--r--  src/film_grain_test.cc | 563
-rw-r--r--  src/gav1/decoder_buffer.h | 37
-rw-r--r--  src/gav1/version.h | 2
-rw-r--r--  src/libgav1_decoder.cmake | 2
-rw-r--r--  src/obu_parser.cc | 88
-rw-r--r--  src/obu_parser.h | 27
-rw-r--r--  src/obu_parser_test.cc | 112
-rw-r--r--  src/post_filter/deblock.cc | 1
-rw-r--r--  src/post_filter_test.cc | 142
-rw-r--r--  src/quantizer.cc | 79
-rw-r--r--  src/quantizer_test.cc | 52
-rw-r--r--  src/threading_strategy_test.cc | 27
-rw-r--r--  src/tile.h | 5
-rw-r--r--  src/tile/prediction.cc | 44
-rw-r--r--  src/utils/constants.h | 20
-rw-r--r--  src/utils/segmentation_map.cc | 5
-rw-r--r--  src/warp_prediction.cc | 3
-rw-r--r--  tests/block_utils.cc | 6
-rw-r--r--  tests/libgav1_tests.cmake | 20
-rw-r--r--  tests/utils.h | 34
113 files changed, 8878 insertions, 4376 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4029de1..52b1b32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,8 @@ libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
VALUE ON)
libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
"Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_EXAMPLES HELPSTRING "Enables examples." VALUE
+ ON)
libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
libgav1_option(
NAME LIBGAV1_VERBOSE HELPSTRING
@@ -101,6 +103,12 @@ libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
# Controls use of std::mutex and absl::Mutex in ThreadPool.
libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+if((DEFINED
+ LIBGAV1_THREADPOOL_USE_STD_MUTEX
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ OR NOT (DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX OR ANDROID OR IOS))
+ set(use_absl_threading TRUE)
+endif()
if(LIBGAV1_VERBOSE)
libgav1_dump_cmake_flag_variables()
@@ -124,18 +132,22 @@ endif()
libgav1_set_test_flags()
set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
-if(NOT EXISTS "${libgav1_abseil}")
- message(
- FATAL_ERROR
- "Abseil not found. This dependency is required by the"
- " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
- " not defined. To continue, download the Abseil repository to"
- " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n"
- " clone \\\n"
- " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+if(EXISTS "${libgav1_abseil}")
+ set(ABSL_PROPAGATE_CXX_STD ON)
+ add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}"
+ EXCLUDE_FROM_ALL)
+else()
+ if(use_absl_threading OR LIBGAV1_ENABLE_EXAMPLES OR LIBGAV1_ENABLE_TESTS)
+ message(
+ FATAL_ERROR
+ "Abseil not found. This dependency is required by the"
+ " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+ " not defined. To continue, download the Abseil repository to"
+ " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n"
+ " clone \\\n"
+ " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+ endif()
endif()
-set(ABSL_PROPAGATE_CXX_STD ON)
-add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
libgav1_reset_target_lists()
libgav1_add_dsp_targets()
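
For reference, the new use_absl_threading condition reduces to the following boolean (a restatement of the CMake above, with USE_STD_MUTEX abbreviating LIBGAV1_THREADPOOL_USE_STD_MUTEX):

// use_absl_threading =  ( defined(USE_STD_MUTEX) && !USE_STD_MUTEX)
//                    || (!defined(USE_STD_MUTEX) && !ANDROID && !IOS)

That is, absl-based threading is wanted when std::mutex is explicitly disabled, or when nothing is specified on platforms other than Android/iOS; only in those cases, or when examples/tests are enabled, is the third_party/abseil-cpp checkout still mandatory.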
diff --git a/README.md b/README.md
index 6744291..04c6a94 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# libgav1 -- an AV1 decoder
-libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
-information on the AV1 video format can be found at
+libgav1 is a Main profile (0), High profile (1) & Professional profile (2)
+compliant AV1 decoder. More information on the AV1 video format can be found at
[aomedia.org](https://aomedia.org).
[TOC]
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake
index 0d00bb6..95c17be 100644
--- a/cmake/libgav1_build_definitions.cmake
+++ b/cmake/libgav1_build_definitions.cmake
@@ -31,8 +31,8 @@ macro(libgav1_set_build_definitions)
# passed to libtool.
#
# We set LIBGAV1_SOVERSION = [c-a].a.r
- set(LT_CURRENT 0)
- set(LT_REVISION 1)
+ set(LT_CURRENT 1)
+ set(LT_REVISION 0)
set(LT_AGE 0)
math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
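
Working through the libtool arithmetic with the new values: LIBGAV1_SOVERSION_MAJOR = LT_CURRENT - LT_AGE = 1 - 0 = 1, so LIBGAV1_SOVERSION becomes 1.0.0 (the previous values gave 0 - 0 = 0, i.e. 0.0.1).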
@@ -142,8 +142,10 @@ macro(libgav1_set_build_definitions)
if(NOT LIBGAV1_MAX_BITDEPTH)
set(LIBGAV1_MAX_BITDEPTH 10)
- elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
- libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+ elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8
+ AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10
+ AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 12)
+ libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.")
endif()
list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake
index b7f6006..e2c79b9 100644
--- a/cmake/libgav1_install.cmake
+++ b/cmake/libgav1_install.cmake
@@ -48,8 +48,10 @@ macro(libgav1_setup_install_target)
FILES ${libgav1_api_includes}
DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
- install(TARGETS gav1_decode DESTINATION
- "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ if(LIBGAV1_ENABLE_EXAMPLES)
+ install(TARGETS gav1_decode DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ endif()
install(TARGETS libgav1_static DESTINATION
"${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
if(BUILD_SHARED_LIBS)
diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake
index 492957b..b550397 100644
--- a/cmake/toolchains/android.cmake
+++ b/cmake/toolchains/android.cmake
@@ -30,9 +30,9 @@ if(NOT ANDROID_ABI)
set(ANDROID_ABI arm64-v8a)
endif()
-# Force arm mode for 32-bit targets (instead of the default thumb) to improve
-# performance.
-if(NOT ANDROID_ARM_MODE)
+# Force arm mode for 32-bit arm targets (instead of the default thumb) to
+# improve performance.
+if(ANDROID_ABI MATCHES "^armeabi" AND NOT ANDROID_ARM_MODE)
set(ANDROID_ARM_MODE arm)
endif()
diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake
index 7448f54..7d58ce1 100644
--- a/cmake/toolchains/arm-linux-gnueabihf.cmake
+++ b/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -27,10 +27,13 @@ endif()
if(NOT CMAKE_C_COMPILER)
set(CMAKE_C_COMPILER ${CROSS}gcc)
endif()
-set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm")
+# Note: -march=armv7-a+fp is an alternative to -mfpu with newer versions of
+# gcc:
+# https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=dff2abcbee65dbb4b7ca3ade0f7622ffdc0af391
+set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
if(NOT CMAKE_CXX_COMPILER)
set(CMAKE_CXX_COMPILER ${CROSS}g++)
endif()
-set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm -mfpu=vfpv3")
set(CMAKE_SYSTEM_PROCESSOR "armv7")
set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
index 1f949f3..a3ec156 100644
--- a/examples/libgav1_examples.cmake
+++ b/examples/libgav1_examples.cmake
@@ -17,6 +17,13 @@ if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+if(NOT LIBGAV1_ENABLE_EXAMPLES)
+ macro(libgav1_add_examples_targets)
+
+ endmacro()
+ return()
+endif()
+
set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
"${libgav1_examples}/file_reader.h"
"${libgav1_examples}/file_reader_constants.cc"
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
index c1a5606..582f13c 100644
--- a/src/buffer_pool.cc
+++ b/src/buffer_pool.cc
@@ -156,19 +156,15 @@ bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
}
RefCountedBufferPtr BufferPool::GetFreeBuffer() {
- // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
- // from the same thread serially, but the GetFreeBuffer() call in
- // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
- // time. So this function has to be thread safe.
- // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
- // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
- // need not be thread safe.
std::unique_lock<std::mutex> lock(mutex_);
for (auto buffer : buffers_) {
if (!buffer->in_use_) {
buffer->in_use_ = true;
buffer->progress_row_ = -1;
buffer->frame_state_ = kFrameStateUnknown;
+ buffer->hdr_cll_set_ = false;
+ buffer->hdr_mdcv_set_ = false;
+ buffer->itut_t35_set_ = false;
lock.unlock();
return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
}
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
index d9eba6d..d4e50e0 100644
--- a/src/buffer_pool.h
+++ b/src/buffer_pool.h
@@ -33,6 +33,7 @@
#include "src/symbol_decoder_context.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
#include "src/utils/reference_info.h"
#include "src/utils/segmentation.h"
#include "src/utils/segmentation_map.h"
@@ -134,6 +135,36 @@ class RefCountedBuffer : public MaxAlignedAllocable {
int temporal_id() const { return temporal_id_; }
void set_temporal_id(int value) { temporal_id_ = value; }
+ ObuMetadataHdrCll hdr_cll() const { return hdr_cll_; }
+ void set_hdr_cll(const ObuMetadataHdrCll& hdr_cll) {
+ hdr_cll_set_ = true;
+ hdr_cll_ = hdr_cll;
+ }
+ bool hdr_cll_set() const { return hdr_cll_set_; }
+
+ ObuMetadataHdrMdcv hdr_mdcv() const { return hdr_mdcv_; }
+ void set_hdr_mdcv(const ObuMetadataHdrMdcv& hdr_mdcv) {
+ hdr_mdcv_set_ = true;
+ hdr_mdcv_ = hdr_mdcv;
+ }
+ bool hdr_mdcv_set() const { return hdr_mdcv_set_; }
+
+ ObuMetadataItutT35 itut_t35() const { return itut_t35_; }
+ bool set_itut_t35(const ObuMetadataItutT35& itut_t35,
+ const uint8_t* const payload) {
+ itut_t35_ = itut_t35;
+ if (itut_t35.payload_size > 0) {
+ if (!itut_t35_payload_.Resize(itut_t35.payload_size)) return false;
+ memcpy(itut_t35_payload_.get(), payload, itut_t35.payload_size);
+ itut_t35_.payload_bytes = itut_t35_payload_.get();
+ } else {
+ itut_t35_.payload_bytes = nullptr;
+ }
+ itut_t35_set_ = true;
+ return true;
+ }
+ bool itut_t35_set() const { return itut_t35_set_; }
+
SegmentationMap* segmentation_map() { return &segmentation_map_; }
const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
@@ -317,6 +348,14 @@ class RefCountedBuffer : public MaxAlignedAllocable {
int spatial_id_ = 0;
int temporal_id_ = 0;
+ ObuMetadataHdrCll hdr_cll_ = {};
+ bool hdr_cll_set_ = false; // Set to true when set_hdr_cll() is called.
+ ObuMetadataHdrMdcv hdr_mdcv_ = {};
+ bool hdr_mdcv_set_ = false; // Set to true when set_hdr_mdcv() is called.
+ ObuMetadataItutT35 itut_t35_ = {};
+ DynamicBuffer<uint8_t> itut_t35_payload_;
+ bool itut_t35_set_ = false; // Set to true when set_itut_t35() is called.
+
// segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
SegmentationMap segmentation_map_;
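
A minimal caller-side sketch of the new deep-copying setter (field names follow src/gav1/decoder_buffer.h; the variable names and the parser-side call site are assumptions, not code from this change):

// set_itut_t35() copies |payload| into itut_t35_payload_, so the caller's
// storage only needs to stay valid for the duration of the call.
ObuMetadataItutT35 t35 = {};
t35.country_code = 0xa6;         // ITU-T T.35 country code byte.
t35.payload_size = payload_size;
if (!buffer->set_itut_t35(t35, payload)) {
  return kStatusOutOfMemory;     // DynamicBuffer::Resize() failed.
}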
diff --git a/src/c_decoder_test.c b/src/c_decoder_test.c
index 10ef29f..9587262 100644
--- a/src/c_decoder_test.c
+++ b/src/c_decoder_test.c
@@ -20,6 +20,9 @@
// clang-format off
#include "src/gav1/decoder.h"
+
+// Import the test frame #defines.
+#include "src/decoder_test_data.h"
// clang-format on
#include <stddef.h>
@@ -67,40 +70,17 @@
} \
} while (0)
-// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf
-static const uint8_t kFrame1[] = {
- 0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc,
- 0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0,
- 0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82,
- 0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde,
- 0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b,
- 0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63,
- 0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72,
- 0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e,
- 0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8,
- 0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0,
- 0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15,
- 0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc,
- 0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb,
- 0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc,
- 0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90,
- 0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1,
- 0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3,
- 0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb,
- 0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11,
- 0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37,
- 0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde,
- 0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89,
- 0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51,
- 0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4,
- 0xaf, 0xd6, 0x6b, 0x38};
-
-static const uint8_t kFrame2[] = {
- 0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46,
- 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed,
- 0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a,
- 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76,
- 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80};
+static const uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+ OBU_FRAME_1};
+
+static const uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+static const uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+ OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+static const uint8_t kFrame2WithItutT35[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_METADATA_ITUT_T35, OBU_FRAME_2};
typedef struct DecoderTest {
Libgav1Decoder* decoder;
@@ -429,12 +409,68 @@ static void DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(void) {
test.decoder = NULL;
}
+static void DecoderTestMetadataObu(void) {
+ DecoderTest test;
+ DecoderTestInit(&test);
+ DecoderTestSetUp(&test);
+
+ Libgav1StatusCode status;
+ const Libgav1DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1WithHdrCllAndHdrMdcv,
+ sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+ (uint8_t*)&kFrame1WithHdrCllAndHdrMdcv);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ // Dequeue the output of frame1.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(buffer->has_hdr_cll, 1);
+ ASSERT_EQ(buffer->has_hdr_mdcv, 1);
+ ASSERT_EQ(buffer->has_itut_t35, 0);
+ ASSERT_EQ(test.released_input_buffer, &kFrame1WithHdrCllAndHdrMdcv);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2WithItutT35,
+ sizeof(kFrame2WithItutT35), 0,
+ (uint8_t*)&kFrame2WithItutT35);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+
+ ASSERT_EQ(test.frames_in_use, 1);
+
+ // Dequeue the output of frame2.
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer);
+ ASSERT_EQ(status, kLibgav1StatusOk);
+ ASSERT_NE(buffer, NULL);
+ ASSERT_EQ(buffer->has_hdr_cll, 0);
+ ASSERT_EQ(buffer->has_hdr_mdcv, 0);
+ ASSERT_EQ(buffer->has_itut_t35, 1);
+ ASSERT_NE(buffer->itut_t35.payload_bytes, NULL);
+ ASSERT_NE(buffer->itut_t35.payload_size, 0);
+ ASSERT_EQ(test.released_input_buffer, &kFrame2WithItutT35);
+
+ ASSERT_EQ(test.frames_in_use, 2);
+ ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data);
+
+ status = Libgav1DecoderSignalEOS(test.decoder);
+ ASSERT_EQ(test.frames_in_use, 0);
+
+ Libgav1DecoderDestroy(test.decoder);
+}
+
int main(void) {
fprintf(stderr, "C DecoderTest started\n");
DecoderTestAPIFlowForNonFrameParallelMode();
DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing();
DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame();
DecoderTestNonFrameParallelModeInvalidFrameAfterEOS();
+ DecoderTestMetadataObu();
fprintf(stderr, "C DecoderTest passed\n");
return 0;
}
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
index dbb9e81..e8de64a 100644
--- a/src/decoder_impl.cc
+++ b/src/decoder_impl.cc
@@ -1171,6 +1171,24 @@ StatusCode DecoderImpl::CopyFrameToOutputBuffer(
buffer_.spatial_id = frame->spatial_id();
buffer_.temporal_id = frame->temporal_id();
buffer_.buffer_private_data = frame->buffer_private_data();
+ if (frame->hdr_cll_set()) {
+ buffer_.has_hdr_cll = 1;
+ buffer_.hdr_cll = frame->hdr_cll();
+ } else {
+ buffer_.has_hdr_cll = 0;
+ }
+ if (frame->hdr_mdcv_set()) {
+ buffer_.has_hdr_mdcv = 1;
+ buffer_.hdr_mdcv = frame->hdr_mdcv();
+ } else {
+ buffer_.has_hdr_mdcv = 0;
+ }
+ if (frame->itut_t35_set()) {
+ buffer_.has_itut_t35 = 1;
+ buffer_.itut_t35 = frame->itut_t35();
+ } else {
+ buffer_.has_itut_t35 = 0;
+ }
output_frame_ = frame;
return kStatusOk;
}
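
On the output side, a minimal sketch of how an application consumes these fields (it mirrors the assertions added to the tests later in this change; decoder setup is omitted):

const libgav1::DecoderBuffer* buffer = nullptr;
if (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk && buffer != nullptr) {
  if (buffer->has_hdr_cll) {
    // buffer->hdr_cll carries the content light level metadata.
  }
  if (buffer->has_hdr_mdcv) {
    // buffer->hdr_mdcv carries the mastering display color volume metadata.
  }
  if (buffer->has_itut_t35) {
    // buffer->itut_t35.payload_bytes points at payload_size bytes owned by
    // the decoder's output frame.
  }
}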
@@ -1602,7 +1620,7 @@ StatusCode DecoderImpl::ApplyFilmGrain(
(*film_grain_frame)->buffer()->stride(kPlaneV));
const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
#if LIBGAV1_MAX_BITDEPTH >= 10
- if (displayable_frame->buffer()->bitdepth() > 8) {
+ if (displayable_frame->buffer()->bitdepth() == 10) {
FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
displayable_frame->buffer()->is_monochrome(),
color_matrix_is_identity,
@@ -1625,6 +1643,30 @@ StatusCode DecoderImpl::ApplyFilmGrain(
return kStatusOk;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ if (displayable_frame->buffer()->bitdepth() == 12) {
+ FilmGrain<12> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
displayable_frame->buffer()->is_monochrome(),
color_matrix_is_identity,
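
The earlier hunk's change from "bitdepth() > 8" to "bitdepth() == 10" matters once 12-bit support exists: the dispatch must now select exactly one of three instantiations. Schematically:

// bitdepth 8  -> FilmGrain<8>   (always compiled)
// bitdepth 10 -> FilmGrain<10>  (#if LIBGAV1_MAX_BITDEPTH >= 10)
// bitdepth 12 -> FilmGrain<12>  (#if LIBGAV1_MAX_BITDEPTH == 12)
// With the old "> 8" test, a 12-bit frame would have been routed through the
// 10-bit film grain path.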
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
index b52ecdf..b75417d 100644
--- a/src/decoder_impl.h
+++ b/src/decoder_impl.h
@@ -141,8 +141,9 @@ class DecoderImpl : public Allocable {
int64_t user_private_data, void* buffer_private_data);
StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
static constexpr int GetMaxBitdepth() {
- static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
- "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+ static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10 ||
+ LIBGAV1_MAX_BITDEPTH == 12,
+ "LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12.");
return LIBGAV1_MAX_BITDEPTH;
}
diff --git a/src/decoder_test.cc b/src/decoder_test.cc
index de7d490..e274122 100644
--- a/src/decoder_test.cc
+++ b/src/decoder_test.cc
@@ -20,44 +20,22 @@
#include <new>
#include "gtest/gtest.h"
+#include "src/decoder_test_data.h"
namespace libgav1 {
namespace {
-// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf
-constexpr uint8_t kFrame1[] = {
- 0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc,
- 0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0,
- 0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82,
- 0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde,
- 0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b,
- 0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63,
- 0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72,
- 0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e,
- 0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8,
- 0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0,
- 0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15,
- 0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc,
- 0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb,
- 0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc,
- 0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90,
- 0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1,
- 0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3,
- 0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb,
- 0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11,
- 0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37,
- 0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde,
- 0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89,
- 0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51,
- 0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4,
- 0xaf, 0xd6, 0x6b, 0x38};
-
-constexpr uint8_t kFrame2[] = {
- 0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46,
- 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed,
- 0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a,
- 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76,
- 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80};
+constexpr uint8_t kFrame1[] = {OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER,
+ OBU_FRAME_1};
+
+constexpr uint8_t kFrame2[] = {OBU_TEMPORAL_DELIMITER, OBU_FRAME_2};
+
+constexpr uint8_t kFrame1WithHdrCllAndHdrMdcv[] = {
+ OBU_TEMPORAL_DELIMITER, OBU_SEQUENCE_HEADER, OBU_METADATA_HDR_CLL,
+ OBU_METADATA_HDR_MDCV, OBU_FRAME_1};
+
+constexpr uint8_t kFrame2WithItutT35[] = {OBU_TEMPORAL_DELIMITER,
+ OBU_METADATA_ITUT_T35, OBU_FRAME_2};
class DecoderTest : public testing::Test {
public:
@@ -348,5 +326,54 @@ TEST_F(DecoderTest, NonFrameParallelModeInvalidFrameAfterEOS) {
EXPECT_EQ(frames_in_use_, 0);
}
+TEST_F(DecoderTest, MetadataObu) {
+ StatusCode status;
+ const DecoderBuffer* buffer;
+
+ // Enqueue frame1 for decoding.
+ status = decoder_->EnqueueFrame(
+ kFrame1WithHdrCllAndHdrMdcv, sizeof(kFrame1WithHdrCllAndHdrMdcv), 0,
+ const_cast<uint8_t*>(kFrame1WithHdrCllAndHdrMdcv));
+ ASSERT_EQ(status, kStatusOk);
+
+ // Dequeue the output of frame1.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(buffer->has_hdr_cll, 1);
+ EXPECT_EQ(buffer->has_hdr_mdcv, 1);
+ EXPECT_EQ(buffer->has_itut_t35, 0);
+ EXPECT_EQ(released_input_buffer_, &kFrame1WithHdrCllAndHdrMdcv);
+
+ // libgav1 has decoded frame1 and is holding a reference to it.
+ EXPECT_EQ(frames_in_use_, 1);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ // Enqueue frame2 for decoding.
+ status =
+ decoder_->EnqueueFrame(kFrame2WithItutT35, sizeof(kFrame2WithItutT35), 0,
+ const_cast<uint8_t*>(kFrame2WithItutT35));
+ ASSERT_EQ(status, kStatusOk);
+
+ EXPECT_EQ(frames_in_use_, 1);
+
+ // Dequeue the output of frame2.
+ status = decoder_->DequeueFrame(&buffer);
+ ASSERT_EQ(status, kStatusOk);
+ ASSERT_NE(buffer, nullptr);
+ EXPECT_EQ(buffer->has_hdr_cll, 0);
+ EXPECT_EQ(buffer->has_hdr_mdcv, 0);
+ EXPECT_EQ(buffer->has_itut_t35, 1);
+ EXPECT_NE(buffer->itut_t35.payload_bytes, nullptr);
+ EXPECT_GT(buffer->itut_t35.payload_size, 0);
+ EXPECT_EQ(released_input_buffer_, &kFrame2WithItutT35);
+
+ EXPECT_EQ(frames_in_use_, 2);
+ EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+ status = decoder_->SignalEOS();
+ EXPECT_EQ(frames_in_use_, 0);
+}
+
} // namespace
} // namespace libgav1
diff --git a/src/decoder_test_data.h b/src/decoder_test_data.h
new file mode 100644
index 0000000..78b6b46
--- /dev/null
+++ b/src/decoder_test_data.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2022 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_TEST_DATA_H_
+#define LIBGAV1_SRC_DECODER_TEST_DATA_H_
+
+// The bytes for these two frames come from the libaom test vector
+// av1-1-b8-01-size-32x32.ivf
+#define OBU_TEMPORAL_DELIMITER 0x12, 0x0
+#define OBU_SEQUENCE_HEADER \
+ 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, 0xc0, 0x20
+#define OBU_FRAME_1 \
+ 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x10, 0x30, \
+ 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, 0xf2, 0xa4, 0xa4, \
+ 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, 0xa8, 0x6f, 0x8d, \
+ 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, 0x8b, 0x0, 0xff, \
+ 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, 0xc2, 0xc6, 0x6e, \
+ 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, 0xeb, 0xbb, 0x4f, \
+ 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, 0x5e, 0x1b, 0x65, \
+ 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, 0xd9, 0x8e, 0x9c, \
+ 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, 0x62, 0x69, 0xd, \
+ 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, 0xa3, 0xe1, 0x42, \
+ 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, 0x75, 0xe9, 0xe3, \
+ 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, 0x82, 0x87, 0x71, \
+ 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, 0xcd, 0xe7, 0x12, \
+ 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, 0x45, 0x36, 0x52, \
+ 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, 0x26, 0x46, 0x1b, \
+ 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, 0xf4, 0x62, 0xf4, \
+ 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, 0xe1, 0xd6, 0x88, \
+ 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, 0x85, 0x8e, 0xa2, \
+ 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, 0xc4, 0x8e, 0x9e, \
+ 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, 0x57, 0x86, 0xcb, \
+ 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, 0xeb, 0x91, 0xb3, \
+ 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, 0x56, 0x75, 0xb3, \
+ 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, 0xaf, 0xd6, 0x6b, \
+ 0x38
+#define OBU_FRAME_2 \
+ 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, 0xa8, 0x80, 0x0, 0x3, \
+ 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, 0xb1, 0x51, 0x15, 0x58, 0xc7, \
+ 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, \
+ 0xa6, 0x11, 0x7, 0x49, 0x76, 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, \
+ 0x98, 0x17, 0x3d, 0x80
+#define OBU_METADATA_HDR_CLL 0x2a, 0x06, 0x01, 0x27, 0x10, 0x0d, 0xdf, 0x80
+#define OBU_METADATA_HDR_MDCV \
+ 0x2a, 0x1a, 0x02, 0xae, 0x14, 0x51, 0xec, 0x43, 0xd7, 0xb0, 0xa4, 0x26, \
+ 0x66, 0x0f, 0x5c, 0x50, 0x0d, 0x54, 0x39, 0x00, 0x0f, 0xa0, 0x00, 0x00, \
+ 0x00, 0x00, 0x52, 0x80
+#define OBU_METADATA_ITUT_T35 \
+ 0x2a, 0xf, 0x04, 0xa6, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, \
+ 0x00, 0x80, 0x00, 0x00
+
+#endif // LIBGAV1_SRC_DECODER_TEST_DATA_H_
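
For readers decoding the hex above, an annotation of the leading bytes (my reading of the AV1 OBU syntax; worth cross-checking against the spec):

// 0x12 0x00           : OBU header, type 2 (temporal delimiter), size 0.
// 0x0a 0x0a ...       : OBU header, type 1 (sequence header), size 10.
// 0x2a 0x06 0x01 ...  : OBU header, type 5 (metadata), size 6,
//                       metadata_type 1 (HDR_CLL), then max_cll / max_fall.
// 0x2a 0x1a 0x02 ...  : metadata OBU, size 26, metadata_type 2 (HDR_MDCV).
// 0x2a 0x0f 0x04 0xa6 : metadata OBU, size 15, metadata_type 4 (ITUT_T35),
//                       country_code 0xa6, then the T.35 payload.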
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index 9c46525..c0af2c1 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
return dst;
}
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u16_u8(
+ MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(vld1_u8(source), over_read_in_bytes);
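
A sketch of the kind of caller the new uint16x8_t overload serves (the helper name LoadRowMsanU16 is hypothetical; it mirrors the existing Load1QMsanU16 pattern below):

// Load 8 uint16_t lanes when only |valid| of them are in bounds; the
// overread lanes are zeroed so MSan does not flag them downstream.
inline uint16x8_t LoadRowMsanU16(const uint16_t* const src, const int valid) {
  const ptrdiff_t over_read_in_bytes = (8 - valid) * sizeof(uint16_t);
  return MaskOverreadsQ(vld1q_u16(src), over_read_in_bytes);
}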
@@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
}
-inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
- const ptrdiff_t over_read_in_bytes) {
- // Relative source index of elements (2 bytes each):
- // dst.val[0]: 00 02 04 06 08 10 12 14
- // dst.val[1]: 01 03 05 07 09 11 13 15
- uint16x8x2_t dst = vld2q_u16(source);
- dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
- vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
- dst.val[1] = vreinterpretq_u16_u8(
- MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
- (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
- return dst;
-}
-
inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
const ptrdiff_t over_read_in_bytes) {
return vreinterpretq_u32_u8(MaskOverreadsQ(
@@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) {
vst1q_u16(static_cast<uint16_t*>(buf), val);
}
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+ // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+ // bytes of the destination as initialized. To avoid missing truly
+ // uninitialized memory, check the input vectors first, before marking the
+ // whole 64 bytes initialized. If any input vector contains unused values, it
+ // should pass through MaskOverreadsQ first.
+ __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+ __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+ __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+ __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+ __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif // LIBGAV1_MSAN
+}
+
//------------------------------------------------------------------------------
// Pointer helpers.
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
//------------------------------------------------------------------------------
// Saturation helpers.
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
return vmin_s16(vmax_s16(val, low), high);
}
@@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
return vminq_s16(vmaxq_s16(val, low), high);
}
-inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
const int16x8_t low = vdupq_n_s16(0);
const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
@@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
-inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
int16x8x2_t b0;
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
return b0;
}
-inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
uint16x8x2_t b0;
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
@@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
// b:
// 00 10 02 12
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
index b7205df..389f029 100644
--- a/src/dsp/arm/convolve_10bit_neon.cc
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -45,12 +45,12 @@ namespace {
// Pixel output range: [ 0, 1023]
// Compound output range: [ 3988, 61532]
-template <int filter_index>
+template <int num_taps>
int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
const int16x4_t* const taps) {
const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
int32x4x2_t sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
return sum;
}
-template <int filter_index>
+template <int num_taps>
int32x4_t SumOnePassTaps(const uint16x4_t* const src,
const int16x4_t* const taps) {
const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
int32x4_t sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
sum = vmlal_s16(sum, ssrc[3], taps[3]);
sum = vmlal_s16(sum, ssrc[4], taps[4]);
sum = vmlal_s16(sum, ssrc[5], taps[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
sum = vmlal_s16(sum, ssrc[5], taps[5]);
sum = vmlal_s16(sum, ssrc[6], taps[6]);
sum = vmlal_s16(sum, ssrc[7], taps[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
return sum;
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const uint16x8_t src_long_hi = vld1q_u16(s + 8);
uint16x8_t v_src[8];
int32x4x2_t v_sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
v_src[4] = vextq_u16(src_long, src_long_hi, 4);
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
v_src[6] = vextq_u16(src_long, src_long_hi, 6);
v_src[7] = vextq_u16(src_long, src_long_hi, 7);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- } else if (filter_index == 3) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
- } else { // filter_index > 3
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
const int16x4_t d0 =
@@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
uint16x8_t v_src[8];
int32x4x2_t v_sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
v_src[4] = vextq_u16(src_long, src_long_hi, 4);
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
v_src[6] = vextq_u16(src_long, src_long_hi, 6);
v_src[7] = vextq_u16(src_long, src_long_hi, 7);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- } else if (filter_index == 3) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
- } else { // filter_index > 3
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
if (is_compound) {
const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
@@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
} while (--y != 0);
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
int32x4_t v_sum;
const uint16x8_t src_long = vld1q_u16(src);
v_src[0] = vget_low_u16(src_long);
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
} else {
v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
if (is_compound || is_2d) {
const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
@@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
} while (--y != 0);
}
-template <int filter_index, bool is_2d>
+template <int num_taps, bool is_2d>
void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
const int16x8x2_t input = vzipq_s16(input0, input1);
int32x4_t v_sum;
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
v_sum = vmlal_s16(v_sum,
vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
@@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
int32x4_t v_sum;
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
v_sum =
vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
@@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
}
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t pred_stride, const int width,
const int height, const int16x4_t* const v_tap) {
- assert(width < 8 || filter_index <= 3);
+ assert(width < 8 || num_taps != 4);
// Don't simplify the redundant if conditions with the template parameters,
// which helps the compiler generate compact code.
- if (width >= 8 && filter_index <= 3) {
- FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>(
+ if (width >= 8 && num_taps != 4) {
+ FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>(
src, src_stride, dest, pred_stride, width, height, v_tap);
return;
}
@@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
// Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(filter_index >= 3 && filter_index <= 5);
- if (filter_index >= 3 && filter_index <= 5) {
+ assert(num_taps == 2 || num_taps == 4);
+ if (num_taps == 2 || num_taps == 4) {
if (width == 4) {
- FilterHorizontalWidth4<filter_index, is_compound, is_2d>(
+ FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
src, src_stride, dest, pred_stride, height, v_tap);
return;
}
assert(width == 2);
if (!is_compound) {
- FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
- pred_stride, height, v_tap);
+ FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride,
+ FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride,
width, height, v_tap);
- } else if (filter_index == 1) { // 6 tap.
- FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst,
+ } else if (filter_index < 2) { // 6 tap.
+ FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst,
dst_stride, width, height, v_tap);
- } else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst,
- dst_stride, width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
dst_stride, width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst,
- dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst,
+ FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
dst_stride, width, height, v_tap);
}
}
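
The renamed template parameter encodes the filter_index -> tap-count mapping directly. A sketch of that mapping, consistent with the dispatch above and with the GetNumTapsInFilter() helper the vertical paths previously called:

constexpr int GetNumTapsInFilter(const int filter_index) {
  // filter_index 0 and 1 are the two 6-tap filters, 2 is the lone 8-tap
  // filter, 3 is 2-tap, and 4/5 are the two 4-tap filters (hence the
  // (filter_index & 0x4) test above).
  return (filter_index == 2) ? 8
         : (filter_index < 2) ? 6
         : (filter_index == 3) ? 2
                               : 4;
}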
@@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON(
filter_index);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* const dst16 = static_cast<uint16_t*>(dst);
@@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
srcs[next_row] = vld1q_u16(src_x);
src_x += src_stride;
- const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+ const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
if (is_compound) {
const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
const int16x4_t d0 =
@@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int height,
const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
srcs[num_taps] = vld1_u16(src);
src += src_stride;
- const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
- const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps);
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+ const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps);
if (is_compound) {
const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
const int16x4_t d1 =
@@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index>
+template <int num_taps>
void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int height,
const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
src += src_stride;
srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
- const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
const uint16x4_t d0 =
vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
Store2<0>(dst16, d0);
@@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON(
if (filter_index == 0) { // 6 tap.
if (width == 2) {
- FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else if (width == 4) {
- FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps + 1);
}
} else if ((static_cast<int>(filter_index == 1) &
@@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON(
static_cast<int>(vertical_filter_id == 9) |
static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
if (width == 2) {
- FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else if (width == 4) {
- FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else {
- FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps + 1);
}
} else if (filter_index == 2) { // 8 tap.
if (width == 2) {
- FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
} else if (filter_index == 3) { // 2 tap.
if (width == 2) {
- FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height,
taps + 3);
} else if (width == 4) {
- FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height,
taps + 3);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps + 3);
}
} else {
@@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON(
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
if (width == 2) {
- FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
taps + 2);
} else if (width == 4) {
- FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
taps + 2);
} else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps + 2);
}
}
@@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON(
if (filter_index == 0) { // 6 tap.
if (width == 4) {
- FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 1);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 1);
}
} else if ((static_cast<int>(filter_index == 1) &
@@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON(
static_cast<int>(vertical_filter_id == 9) |
static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
if (width == 4) {
- FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 1);
} else {
- FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 1);
}
} else if (filter_index == 2) { // 8 tap.
if (width == 4) {
- FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
} else if (filter_index == 3) { // 2 tap.
if (width == 4) {
- FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 3);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 3);
}
} else {
@@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON(
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
if (width == 4) {
- FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 2);
} else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 2);
}
}
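
(For reference: these hunks replace |filter_index| template arguments with
explicit tap counts. Summarized from the call sites above, the mapping and the
|taps| pointer offsets are:

    filter_index   0    1    2    3    4/5
    num_taps       6    6    8    2    4
    taps offset   +1   +1   +0   +3   +2

with the exception that filter_index 1 is handled as 4-tap (taps + 2, src
advanced one row) for the vertical_filter_id values whose outer taps are zero.)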
@@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap(
PermuteSrcVals(src_bytes, src_lookup[1])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap(
const uint16x4_t src_high[2] = {vget_high_u16(src[0]),
vget_high_u16(src[1])};
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap(
PermuteSrcVals(src_bytes, src_lookup[3])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
PermuteSrcVals(src_bytes, src_lookup[3])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index 7d287c8..6087276 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4;
namespace low_bitdepth {
namespace {
-inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
- const int16x4_t weights[2]) {
- // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
- const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
- const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
- const int32x4_t blended_lo =
- vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
- const int32x4_t blended_hi =
- vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
-
- return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
- vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+ const int16x8_t weight) {
+ // Given: p0,p1 in range [-5132,9212], with weights satisfying w0 + w1 = 16.
+ // Output: (p0 * w0 + p1 * w1 + 128) >> 8, where 128 is the rounding bit and
+ // 8 == kInterPostRoundBit + 4.
+ // The formula is manipulated to avoid lengthening to 32 bits.
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ // = (p0 - p1) * w0 + 16 * p1
+ // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const int16x8_t diff = vsubq_s16(pred0, pred1);
+ // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4)
+ const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit);
}
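
The manipulation can be checked in scalar form. A minimal sketch of one lane
(hypothetical helper, not from the patch), using the identity
vqdmulhq_s16(d, w << 11) == (2 * d * (w << 11)) >> 16 == (d * w) >> 4:

    // Scalar model of ComputeWeightedAverage8 for a single lane.
    int WeightedAverageScalar(int p0, int p1, int w0) {
      const int diff = p0 - p1;                    // |diff| <= 0x3808
      const int weighted_diff = (diff * w0) >> 4;  // vqdmulhq_s16(diff, w0 << 11)
      const int upscaled_average = weighted_diff + p1;
      // vqrshrun_n_s16(x, 4): add the rounding bit, shift, clamp to [0, 255].
      const int rounded = (upscaled_average + 8) >> 4;
      return rounded < 0 ? 0 : (rounded > 255 ? 255 : rounded);
    }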
-template <int width, int height>
+template <int width>
inline void DistanceWeightedBlendSmall_NEON(
const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int height,
+ const int16x8_t weight, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
constexpr int step = 16 / width;
- for (int y = 0; y < height; y += step) {
+ int y = height;
+ do {
const int16x8_t src_00 = vld1q_s16(prediction_0);
const int16x8_t src_10 = vld1q_s16(prediction_1);
prediction_0 += 8;
prediction_1 += 8;
- const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+ const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight);
const int16x8_t src_01 = vld1q_s16(prediction_0);
const int16x8_t src_11 = vld1q_s16(prediction_1);
prediction_0 += 8;
prediction_1 += 8;
- const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+ const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight);
- const uint8x8_t result0 = vqmovun_s16(res0);
- const uint8x8_t result1 = vqmovun_s16(res1);
if (width == 4) {
StoreLo4(dst, result0);
dst += dest_stride;
@@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON(
vst1_u8(dst, result1);
dst += dest_stride;
}
- }
+ y -= step;
+ } while (y != 0);
}
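
(Each pass of the loop above consumes two 8-lane vectors, i.e. 16 pixels:

    step = 16 / width   ->   width 4: 4 rows per pass;  width 8: 2 rows per pass

so the do/while form relies on height being a multiple of step, which holds for
the 4- and 8-wide block sizes.)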
inline void DistanceWeightedBlendLarge_NEON(
const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight,
const int width, const int height, void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
@@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON(
do {
const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
- const int16x8_t res_lo =
- ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+ const uint8x8_t res_lo =
+ ComputeWeightedAverage8(src0_lo, src1_lo, weight);
const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
- const int16x8_t res_hi =
- ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+ const uint8x8_t res_hi =
+ ComputeWeightedAverage8(src0_hi, src1_hi, weight);
- const uint8x16_t result =
- vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+ const uint8x16_t result = vcombine_u8(res_lo, res_hi);
vst1q_u8(dst + x, result);
x += 16;
} while (x < width);
@@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON(
inline void DistanceWeightedBlend_NEON(
const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height,
+ const uint8_t /*weight_1*/, const int width, const int height,
void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
- // TODO(johannkoenig): Investigate the branching. May be fine to call with a
- // variable height.
+ // Upscale the weight for vqdmulh.
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11);
if (width == 4) {
- if (height == 4) {
- DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
- dest_stride);
- } else if (height == 8) {
- DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
- dest_stride);
- } else {
- assert(height == 16);
- DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
- dest_stride);
- }
+ DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
return;
}
if (width == 8) {
- switch (height) {
- case 4:
- DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- case 8:
- DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- case 16:
- DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- default:
- assert(height == 32);
- DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
- dest_stride);
-
- return;
- }
+ DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
+ return;
}
- DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest,
dest_stride);
}
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 0b1b481..76e1151 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -18,23 +18,21 @@
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
-#include <new>
#include "src/dsp/arm/common_neon.h"
-#include "src/dsp/arm/film_grain_neon.h"
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
#include "src/utils/memory.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) {
return ZeroExtend(vld1_u8(src));
}
-inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return ZeroExtend(Load1MsanU8(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
+ return ZeroExtend(Load1MsanU8(src, 8 - valid_range));
}
inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
@@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) {
return vreinterpretq_s16_u16(vld1q_u16(src));
}
-inline int16x8_t GetSignedSource8Msan(const uint16_t* src,
- int /*valid_range*/) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return vreinterpretq_s16_u16(Load1QMsanU16(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
+ return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range));
}
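
In both overloads the second Load*Msan argument is the number of over-read
bytes at the tail of the vector (vector width in bytes minus |valid_range|).
A scalar sketch of the convention, assuming the helpers zero those bytes under
MemorySanitizer and reduce to a plain load otherwise:

    #include <cstring>
    // Hypothetical model; |valid_range| counts valid bytes, 0 < valid_range <= 16.
    void LoadMasked16(uint8_t lanes[16], const uint8_t* src, int valid_range) {
      memcpy(lanes, src, valid_range);                    // defined bytes
      memset(lanes + valid_range, 0, 16 - valid_range);   // masked over-read
    }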
inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
@@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
}
inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
- int subsampling_x, int /*valid_range*/) {
+ int subsampling_x, int valid_range) {
if (subsampling_x != 0) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call
- // sites causing test vector failures.
- const uint8x16_t src = Load1QMsanU8(luma, 0);
-
+ const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
+ // MemorySanitizer treats vpaddlq_u8 as a use of all 16 loaded bytes, so
+ // the over-read lanes are masked first.
return vrshrq_n_u16(vpaddlq_u8(src), 1);
}
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return vmovl_u8(Load1MsanU8(luma, 0));
+ return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
}
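
With subsampling, vpaddlq_u8 followed by vrshrq_n_u16(..., 1) is a rounded
horizontal pair average; per output lane:

    out[i] = (luma[2 * i] + luma[2 * i + 1] + 1) >> 1;   // i = 0..7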
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
}
inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
- int subsampling_x, int /*valid_range*/) {
+ int subsampling_x, int valid_range) {
if (subsampling_x != 0) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call
- // sites causing test vector failures.
- const uint16x8x2_t src = Load2QMsanU16(luma, 0);
- return vrhaddq_u16(src.val[0], src.val[1]);
+ const uint16x8x2_t src = vld2q_u16(luma);
+ const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]);
+ return MaskOverreadsQ(result, 16 - valid_range);
}
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return Load1QMsanU16(luma, 0);
+ return Load1QMsanU16(luma, 16 - valid_range);
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
@@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
}
static_assert(sizeof(scaling_lut[0]) == 2, "");
Memset(scaling_lut, point_scaling[0],
- std::max(static_cast<int>(point_value[0]), 1)
- << (bitdepth - kBitdepth8));
+ (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
const int32x4_t rounding = vdupq_n_s32(32768);
for (int i = 0; i < num_points - 1; ++i) {
@@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
const int16x8x4_t result = {
start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
- vst4q_s16(&scaling_lut[x_base], result);
+ Store4QMsanS16(&scaling_lut[x_base], result);
} else {
vst1q_s16(&scaling_lut[x_base], full_interp);
}
@@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
}
template <int bitdepth, typename Pixel>
-inline int16x8_t GetScalingFactors(
- const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+ const Pixel* source) {
int16_t start_vals[8];
static_assert(bitdepth <= kBitdepth10,
"NEON Film Grain is not yet implemented for 12bpp.");
+#if LIBGAV1_MSAN
+ memset(start_vals, 0, sizeof(start_vals));
+#endif
for (int i = 0; i < 8; ++i) {
- assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return vld1q_s16(start_vals);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+ const Pixel* source, const int valid_range) {
+ int16_t start_vals[8];
+ static_assert(bitdepth <= kBitdepth10,
+ "NEON Film Grain is not yet implemented for 12bpp.");
+ for (int i = 0; i < valid_range; ++i) {
+ assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
start_vals[i] = scaling_lut[source[i]];
}
return vld1q_s16(start_vals);
@@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON(
const int16x8_t scaling_shift_vect = vdupq_n_s16(
(bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+ const int safe_width = width & ~15;
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_width; x += 8) {
// This operation on the unsigned input is safe in 8bpp because the vector
// is widened before it is reinterpreted.
const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
@@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON(
// This operation on the unsigned input is safe in 8bpp because the vector
// is widened before it is reinterpreted.
const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
- const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>(
- scaling_lut_y, &in_y_row[std::min(x, width)]);
+ const int16x8_t scaling1 =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
@@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON(
// function for just that case, though the gain would be very small.
StoreUnsigned8(&out_y_row[x],
vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
- x += 8;
- } while (x < width);
+ }
+
+ if (x < width) {
+ assert(width - x < 16);
+ if (x < width - 8) {
+ const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can
+ // replace clipping with vqmovun_s16, but it's not likely to be worth
+ // copying the function for just that case, though the gain would be
+ // very small.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ x += 8;
+ }
+ const int valid_range_pixels = width - x;
+ const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
+ const int16x8_t orig =
+ GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut_y, &in_y_row[x], valid_range_pixels);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ }
in_y_row += source_stride_y;
out_y_row += dest_stride_y;
} while (++y < height);
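
The restructured row loop splits each row into a full-vector region and a
masked tail. Schematically (helper names hypothetical):

    const int safe_width = width & ~15;
    int x = 0;
    for (; x + 8 <= safe_width; x += 8) ProcessFull8(x);
    if (x < width) {                                   // 1..15 pixels remain
      if (x < width - 8) { ProcessFull8(x); x += 8; }  // plain 8-wide block
      ProcessMasked8(x, /*valid_range=*/width - x);    // over-read masked for MSan
    }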
@@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON(
template <int bitdepth, typename GrainType, typename Pixel>
inline int16x8_t BlendChromaValsWithCfl(
- const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
- const int16_t* LIBGAV1_RESTRICT scaling_lut,
const Pixel* LIBGAV1_RESTRICT chroma_cursor,
const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
- const int16x8_t scaling_shift_vect) {
- const int16x8_t scaling =
- GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
const int16x8_t orig = GetSignedSource8(chroma_cursor);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
@@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
const int16x8_t floor = vdupq_n_s16(min_value);
const int16x8_t ceiling = vdupq_n_s16(max_chroma);
Pixel luma_buffer[16];
- memset(luma_buffer, 0, sizeof(luma_buffer));
// In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
// for 16 bit signed integers. In higher bitdepths, however, we have to
// expand to 32 to protect the sign bit.
@@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const uint16x8_t average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
StoreUnsigned8(average_luma_buffer, average_luma);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
- average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect);
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case.
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
const uint16x8_t average_luma = GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
StoreUnsigned8(average_luma_buffer, average_luma);
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
- average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect);
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case.
@@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl(
const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
- const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+ const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
uint8_t merged_buffer[8];
const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
@@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl(
// 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
vst1_u8(merged_buffer, merged);
+
const int16x8_t scaling =
- GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
return vaddq_s16(orig, noise);
@@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
const int chroma_width = (width + subsampling_x) >> subsampling_x;
const int safe_chroma_width = chroma_width & ~7;
uint8_t luma_buffer[16];
-#if LIBGAV1_MSAN
- // Quiet msan warnings.
- memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
start_height >>= subsampling_y;
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
- const int valid_range = width - luma_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
- const int16x8_t average_luma = vreinterpretq_s16_u16(
- GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range));
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
// In 8bpp, when params_.clip_to_restricted_range == false, we can
// replace clipping with vqmovun_s16, but the gain would be small.
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
// Begin right edge iteration. Same as the normal iterations, but the
@@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
- const int valid_range_chroma_bytes =
- (chroma_width - x) * sizeof(in_chroma_row[0]);
+ const int valid_range_chroma_pixels = chroma_width - x;
const int16x8_t orig_chroma =
- GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
// End of right edge iteration.
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl(
const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
- const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) {
+ const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
uint16_t merged_buffer[8];
const int32x4_t weighted_luma_low =
vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
@@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl(
vst1q_u16(merged_buffer,
vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
const int16x8_t scaling =
- GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer);
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
+ merged_buffer);
const int16x8_t noise = GetSignedSource8(noise_image_cursor);
const int16x8_t scaled_noise =
ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
@@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int chroma_width = (width + subsampling_x) >> subsampling_x;
const int safe_chroma_width = chroma_width & ~7;
uint16_t luma_buffer[16];
-#if LIBGAV1_MSAN
- // TODO(b/194217060): This can be removed if the range calculations below are
- // fixed.
- memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
// Offset is added before downshifting in order to take advantage of
// saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
@@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const int16x8_t average_luma = vreinterpretq_s16_u16(
GetAverageLuma(&in_y_row[luma_x], subsampling_x));
@@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
// Begin right edge iteration. Same as the normal iterations, but the
@@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_pixels = chroma_width - x;
const int valid_range_chroma_bytes =
(chroma_width - x) * sizeof(in_chroma_row[0]);
const int16x8_t orig_chroma =
GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
// End of right edge iteration.
@@ -1442,10 +1478,8 @@ void Init10bpp() {
dsp->film_grain.initialize_scaling_lut =
InitializeScalingLookupTable_NEON<kBitdepth10>;
- // TODO(b/194442742): reenable this function after segfault under armv7 ASan
- // is fixed.
- // dsp->film_grain.blend_noise_luma =
- // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
dsp->film_grain.blend_noise_chroma[1] =
BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
index 3ba2eef..09596e2 100644
--- a/src/dsp/arm/film_grain_neon.h
+++ b/src/dsp/arm/film_grain_neon.h
@@ -39,9 +39,7 @@ void FilmGrainInit_NEON();
#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
-// TODO(b/194442742): reenable this function after segfault under armv7 ASan is
-// fixed.
-// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 3cad4a6..e9bdcf0 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH(
} while (++y < height);
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-// 7.11.2.4 (8) 90 < angle > 180
-// The strategy for these functions (4xH and 8+xH) is to know how many blocks
-// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
-// then handle only blocks that take from |left_ptr|. Additionally, a fast
-// index-shuffle approach is used for pred values from |left_column| in sections
-// that permit it.
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
inline void DirectionalZone2_4xH(
uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
@@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH(
assert(xstep >= 3);
const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- // TODO(johannkoenig): Revisit this for |width| == 4.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
-
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH(
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
if (min_top_only_x > 0) {
- // Round down to the nearest multiple of 8.
- // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
upsampled_top);
@@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH(
// All rows from |min_left_only_y| down for this set of columns only need
// |left_column| to compute.
const int min_left_only_y = std::min((4 << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
int xstep_bounds = xstep_bounds_base + xstep_y;
int top_x = -xstep - xstep_y;
// +8 increment is OK because if height is 4 this only goes once.
- for (; y < left_shuffle_stop_y;
+ for (; y < min_left_only_y;
y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
DirectionalZone2FromLeftCol_WxH<4>(
dst, stride, min_height,
@@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH(
upsample_top_shift);
}
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<4>(
- dst, stride, min_height,
- left_column + ((y - left_base_increment) << upsample_left_shift),
- base_left_y, -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
- xstep_bounds, top_x, xstep,
- upsample_top_shift);
- }
// Loop over y for left_only rows.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
for (; y < height; y += 8, dst += stride8) {
DirectionalZone3_WxH<4>(
dst, stride, min_height,
@@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH(
}
}
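
For the 4-wide function above, the three stages fall out of two y-boundaries
computed from xstep; as a worked sketch:

    top-only : y <  min((1 << 6) / xstep, height) & ~(min_height - 1)
    mixed    : up to min_left_only_y = min((4 << 6) / xstep, height)
    left-only: y >= min_left_only_y
    // e.g. xstep == 32, height == 16: top-only rows = 0 (2 rounded down to 0),
    // mixed rows y in [0, 8), left-only rows y in [8, 16).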
-// Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
- const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
- const int height, const int xstep, const int ystep,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y,
const bool upsampled_top, const bool upsampled_left) {
const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- // Helper vector.
- const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
-
// Loop incrementers for moving by block (8x8). This function handles blocks
// with height 4 as well. They are calculated in one pass so these variables
// do not get used.
const ptrdiff_t stride8 = stride << 3;
const int xstep8 = xstep << 3;
- const int ystep8 = ystep << 3;
- // Process Wx4 blocks.
+ // Cover 8x4 case.
const int min_height = (height == 4) ? 4 : 8;
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute and can therefore call the Zone1 functions. This assumes |xstep| is
- // at least 3.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // The first stage, before the y-loops, covers blocks that are computed only
+ // from the top row. The second stage, a single y-loop, covers blocks that
+ // mix values computed from top and left. The final stage covers blocks that
+ // are computed only from the left.
+ uint8_t* dst_x = dst + x;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+ } else {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ const int ystep8 = ystep << 3;
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -677,90 +696,43 @@ inline void DirectionalZone2_8(
// left_y vector omits the portion which is covered under the left_column
// offset. Following values need the full ystep as a relative offset.
const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+ // For ystep > 90, at least two sets of 8 columns can be fully computed from
+ // top_row only.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
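
(Rough justification for the 132 cutoff: each row advances ystep / 64 pixels
through |left_column|, so an 8-row block spans about 8 * 132 / 64 = 16.5 pixels
at that angle, just past the reach of one 16-byte TBL vector; for x >= 32,
enough of the later rows are served by |top_row| that the shuffle stays in
range.)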
// These loops treat each set of 8 columns in 3 stages with y-value boundaries.
// The first stage, before the first y-loop, covers blocks that are only
// computed from the top row. The second stage, a single y-loop, covers blocks
// that have a mixture of values computed from top or left. The final stage
// covers blocks that are only computed from the left.
int x = 0;
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling. |d| represents the number of pixels that can
- // fit in one contiguous vector when stepping by |ystep|. For a given x
- // position, the left column values can be obtained by VTBL as long as the
- // values at row[x + d] and beyond come from the top row. However, this does
- // not guarantee that the vector will also contain all of the values needed
- // from top row.
- const int d = 16 / ((ystep >> 6) + 1);
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+ xstep, ystep, x, left_offset, xstep_bounds_base,
+ left_y, upsampled_top, upsampled_left);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
- const int max_shuffle_height =
- std::min(((x + d) << 6) / xstep, height) & ~7;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep,
- upsampled_top);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
- }
+ DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+ ystep, x, left_offset, xstep_bounds_base, left_y,
+ upsampled_top, upsampled_left);
}
- // TODO(johannkoenig): May be able to remove this branch.
if (x < width) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
DirectionalZone1_WxH(dst + x, stride, width - x, height,
top_row + (x << upsample_top_shift), -xstep,
upsampled_top);
@@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON(
DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
upsampled_top, upsampled_left);
} else {
- DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
- ystep, upsampled_top, upsampled_left);
+ DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
}
}
@@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
return vrshrq_n_u16(sum, 5 /*log2(32)*/);
}
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t a_weight,
+ const uint16x8_t b_weight) {
+ const uint16x8_t a_product = vmulq_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x4x2_t* dest,
const uint16_t* LIBGAV1_RESTRICT const source,
@@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest,
}
}
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source_low,
+ const uint16_t* LIBGAV1_RESTRICT const source_high,
+ const bool upsampled) {
+ if (upsampled) {
+ const uint16x4x2_t low = vld2_u16(source_low);
+ const uint16x4x2_t high = vld2_u16(source_high);
+ dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+ dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+ } else {
+ dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+ dest->val[1] =
+ vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+ }
+}
+
template <bool upsampled>
inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t stride, const int height,
@@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
}
template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const uint16x8_t inverter = vdupq_n_u16(32);
+
+ uint16x8x2_t sampled_left_col;
+ // Compute two columns at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ // The low half of pre-transpose vectors contains columns 0 through 3.
+ int left_y_low = base_left_y + ystep;
+ int left_offset_low = left_y_low >> index_scale_bits;
+ int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ // The high half of pre-transpose vectors contains columns 4 through 7.
+ int left_y_high = left_y_low + (ystep << 2);
+ int left_offset_high = left_y_high >> index_scale_bits;
+ int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ uint16x8_t weights_0 =
+ vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_high += ystep;
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ Transpose4x8(result);
+ Store8(dst, result[0]);
+ dst += stride;
+ Store8(dst, result[1]);
+ dst += stride;
+ Store8(dst, result[2]);
+ dst += stride;
+ Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x8x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x8(result);
+ Store4(dst, vget_low_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[3]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[3]));
+}
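
Both unrolled helpers above sample |left| at 6-bit fixed-point positions. Per
output value, the non-upsampled arithmetic reduces to this scalar sketch
(hypothetical helper):

    #include <cstdint>
    // pos: fixed-point position for this output line (advances by ystep).
    uint16_t SampleLeft(const uint16_t* left, int pos) {
      const int offset = pos >> 6;        // integer pixel index
      const int w0 = (pos & 0x3F) >> 1;   // fraction mapped to [0, 32)
      const int w1 = 32 - w0;
      return static_cast<uint16_t>(
          (left[offset] * w1 + left[offset + 1] * w0 + 16) >> 5);
    }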
+
+template <bool upsampled>
inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int height,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
+ assert(height == 8 || height == 16);
const int upsample_shift = static_cast<int>(upsampled);
- int y = 0;
- do {
- DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+ if (height == 16) {
+ dest += stride << 3;
+ DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
ystep);
- dest += 4 * stride;
- y += 4;
- } while (y < height);
+ }
}
template <bool upsampled>
@@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int width,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
- int x = 0;
- int base_left_y = 0;
- do {
- // TODO(petersonab): Establish 8x4 transpose to reserve this function for
- // 8x4 and 16x4.
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
- base_left_y);
- base_left_y += 4 * ystep;
- x += 4;
- } while (x < width);
+ assert(width <= 16);
+ if (width == 4) {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+ return;
+ }
+ DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+ if (width == 16) {
+ const int base_left_y = ystep << 3;
+ DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+ ystep, base_left_y);
+ }
}
template <bool upsampled>
@@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON(
} while (y != 0);
return;
}
- if (width == 4) {
+ if (height == 4) {
if (upsampled_left) {
- DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
} else {
- DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
}
- } else if (height == 4) {
+ } else if (width == 4) {
if (upsampled_left) {
- DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
} else {
- DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
}
} else {
if (upsampled_left) {
@@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
return vrshr_n_u16(sum, 5 /*log2(32)*/);
}
-// Blend two values based on weight pairs that each sum to 32.
-inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
- const uint16x8_t a_weight,
- const uint16x8_t b_weight) {
- const uint16x8_t a_product = vmulq_u16(a, a_weight);
- const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
-
- return vrshrq_n_u16(sum, 5 /*log2(32)*/);
-}
-
// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative in localized functions.
// This is accommodated by making sure the relative indices are within [-15, 0]
@@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH(
} while (++y < height);
}
-inline void DirectionalZone2FromLeftCol_8xH(
- uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+inline void DirectionalZone2FromLeftCol_8x8(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
const bool upsampled) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
- int y = 0;
- do {
+ for (int y = 0; y < 8; ++y) {
uint16x8_t src_left, src_right;
LoadStepwise(
left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
@@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
Store8(dst, val);
dst += stride;
- } while (++y < height);
+ }
}
template <bool upsampled>
@@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH(
}
template <bool upsampled>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+inline void DirectionalZone1Blend_8x8(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
const int xstep) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH(
const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
uint16x8x2_t top_vals;
- int y = height;
- do {
+ for (int y = 0; y < 8; ++y) {
const uint16_t* const src = top_row + (top_x >> scale_bits_x);
LoadEdgeVals(&top_vals, src, upsampled);
@@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH(
dest += stride;
zone_bounds += xstep;
top_x -= xstep;
- } while (--y != 0);
+ }
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
-// that do not correspond to angle derivatives are left at zero.
-// Notably, in cases with upsampling, the shuffle-invalid height is always
-// greater than the prediction height (which is 8 at maximum).
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
// 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
@@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH(
// computed from the top row. The second stage, comprising two y-loops, covers
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
- // Round down to the nearest multiple of 8.
- // TODO(petersonab): Check if rounding to the nearest 4 is okay.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
stride >> 1, max_top_only_y, top_row,
-xstep);
@@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH(
xstep_bounds, top_x, xstep);
}
- // Loop over y for left-only rows.
- for (; y < height; y += 8, dst += stride8) {
- // Angle expected by Zone3 is flipped about the 180 degree vector, which
- // is the x-axis.
+ // Left-only section. |height| - |y| == 4 is assumed to occur only when
+ // (y == 0) && (height == 4).
+ if (height - y == 4) {
+ DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+ return;
+ }
+ if (y < height) {
DirectionalZone3_4xH<upsampled_left>(
- dst, stride, min_height, left_column + (y << upsample_left_shift),
+ dst, stride, height - y, left_column + (y << upsample_left_shift),
-ystep);
}
}
@@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4(
}
}
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop increments for moving by 8x8 blocks. Blocks with height 4 are also
+ // handled by this function, but they are computed in one pass, so these
+ // increments go unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ // The first stage, before the first y-loop, covers blocks that are computed
+ // only from the top row. The second stage, the blending y-loop, covers
+ // blocks that mix values computed from top and left. The final stage covers
+ // blocks that are computed only from the left column.
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns need only
+ // |left_column| to compute. Round up to the nearest multiple of 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsampled_left);
+ } else {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+
+ DirectionalZone1Blend_8x8<upsampled_top>(
+ dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+ xstep);
+ }
+
+ // Loop over y for left-only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+}
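+
+// A scalar sketch (illustrative helper, not part of the library) of the row
+// partition computed above for one 8-wide column set, assuming |xstep| > 0
+// and |height| a multiple of 8. Rows before |*top_only_end| use only the top
+// row; rows from |*mixed_end| down use only the left column; rows in between
+// blend both sources.
+inline void PartitionRows(const int x, const int xstep, const int height,
+                          int* const top_only_end, int* const mixed_end) {
+  *top_only_end = std::min(((x + 1) << 6) / xstep, height) & ~7;
+  *mixed_end = std::min((((x + 8) << 6) / xstep + 7) & ~7, height);
+}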
+
// Process a multiple of 8 |width|.
template <bool upsampled_top, bool upsampled_left>
-inline void DirectionalZone2_8(
+inline void DirectionalZone2_NEON(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
@@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8(
dst, stride, top_row, left_column, width, xstep, ystep);
return;
}
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
// Helper vector.
const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
- // Loop increments for moving by block (8x8). This function handles blocks
- // with height 4 as well. They are calculated in one pass so these variables
- // do not get used.
- const ptrdiff_t stride8 = stride << 3;
- const int xstep8 = xstep << 3;
const int ystep8 = ystep << 3;
// All columns from |min_top_only_x| to the right will only need |top_row| to
// compute and can therefore call the Zone1 functions. This assumes |xstep| is
// at least 3.
assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+ const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
// Offsets the original zone bound value to simplify x < (y + 1) * xstep / 64 - 1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8(
int16x8_t left_y =
vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
int x = 0;
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x * sizeof(uint16_t);
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<upsampled_top>(
- reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8xH(
- dst_x, stride, 8,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
- }
+ DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
}
// Reached |min_top_only_x|.
if (x < width) {
@@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON(
}
if (upsampled_top) {
if (upsampled_left) {
- DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
} else if (upsampled_left) {
- DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
}
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index cd47a22..d1adbdf 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
const uint16x8_t top_left_dist_low,
const uint16x8_t top_left_dist_high) {
- // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
- // using movl(x_dist).
- const uint8x8_t x_le_top_left_low =
- vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
- const uint8x8_t x_le_top_left_high =
- vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
- return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
}
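
// A scalar sketch (illustrative, not part of the library) of the comparison
// above: saturating the 16-bit |top_left_dist| to 255 preserves the result of
// x <= dist, because |x_dist| is 8-bit and can never exceed 255.
inline uint8_t XLeTopLeftScalar(const uint8_t x_dist,
                                const uint16_t top_left_dist) {
  const uint8_t d =
      (top_left_dist > 255) ? 255 : static_cast<uint8_t>(top_left_dist);
  return (x_dist <= d) ? 0xFF : 0;
}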
// Select the closest values and collect them.
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index bcda131..d6c1450 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -31,7 +31,6 @@
namespace libgav1 {
namespace dsp {
-
namespace low_bitdepth {
namespace {
@@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
-inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
- const uint16x4_t weighted_left,
- const uint16x4_t weighted_bl,
- const uint16x4_t weighted_tr) {
- const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
- const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
- const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
- return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+// 256 - v = vneg_s8(v)
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
}
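
// The identity above in scalar form, valid for w in [1, 255]: unsigned
// wraparound computes the complementary smooth weight without widening.
// Illustrative helper, not part of the library:
inline uint8_t ComplementaryWeight(const uint8_t w) {
  return static_cast<uint8_t>(0u - w);  // == (256 - w) mod 256
}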
template <int height>
-inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 4;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v)));
- const uint16x4_t weighted_bl =
- vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v));
-
- const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v));
- const uint16x4_t weighted_left =
- vget_low_u16(vmull_u8(weights_x_v, left_v));
- const uint16x4_t weighted_tr =
- vget_low_u16(vmull_u8(scaled_weights_x, top_right_v));
- const uint16x4_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
-
- StoreLo4(dst, vmovn_u16(vcombine_u16(result, result)));
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+ StoreLo4(dst, result);
dst += stride;
}
}
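
// A scalar sketch (illustrative, not part of the library) of the blend above,
// with kSmoothWeightScale == 8. The truncating halving add plus the rounding
// narrowing shift computes exactly Round2(sum, 9); the dropped low bit cannot
// change the rounded result.
inline uint8_t SmoothPixelScalar(const uint8_t top, const uint8_t bottom_left,
                                 const uint8_t left, const uint8_t top_right,
                                 const uint32_t w_y, const uint32_t w_x) {
  const uint32_t top_bl = w_y * top + (256 - w_y) * bottom_left;  // <= 0xFF00
  const uint32_t left_tr = w_x * left + (256 - w_x) * top_right;  // <= 0xFF00
  return static_cast<uint8_t>((top_bl + left_tr + 256) >> 9);
}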
-inline uint8x8_t CalculatePred(const uint16x8_t weighted_top,
- const uint16x8_t weighted_left,
- const uint16x8_t weighted_bl,
- const uint16x8_t weighted_tr) {
- // Maximum value: 0xFF00
- const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl);
- // Maximum value: 0xFF00
- const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr);
- const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1);
- return vrshrn_n_u16(pred_2, kSmoothWeightScale);
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return CalculatePred(weighted_top_bl, weighted_left_tr);
}
template <int height>
-inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 8;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
@@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
-
- const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint8x8_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
+ CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
vst1_u8(dst, result);
dst += stride;
@@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred(
const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
const uint8x8_t weights_y, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint8x8_t result_low = CalculatePred(
- weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint8x8_t result_high = CalculatePred(
- weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
return vcombine_u8(result_low, result_high);
}
+// 256 - v = vneg_s8(v)
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
template <int width, int height>
-inline void Smooth16PlusxN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t top_right = top[width - 1];
@@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
- // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
- // This currently has a performance slope similar to Paeth so it does not
- // appear to be register bound for arm64.
uint8x16_t weights_x_v[4];
weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
if (width > 16) {
@@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+ scaled_weights_x[0] = NegateS8(weights_x_v[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+ scaled_weights_x[1] = NegateS8(weights_x_v[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+ scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+ scaled_weights_x[3] = NegateS8(weights_x_v[3]);
}
}
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
@@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothVertical4Or8xN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t bottom_left = left[height - 1];
@@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
- const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON(
inline uint8x16_t CalculateVerticalWeightsAndPred(
const uint8x16_t top, const uint8x8_t weights_y,
const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
- const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
- const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
const uint8x8_t pred_scaled_high =
vrshrn_n_u16(pred_high, kSmoothWeightScale);
@@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred(
}
template <int width, int height>
-inline void SmoothVertical16PlusxN_NEON(
+void SmoothVertical16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
const uint8x16_t pred_0 =
@@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothHorizontal4Or8xN_NEON(
+void SmoothHorizontal4Or8xN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
// Over-reads for 4xN but still within the array.
const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
-
- const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
- const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x, left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred(
const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x) {
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
- const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
const uint8x8_t pred_scaled_high =
- vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
return vcombine_u8(pred_scaled_low, pred_scaled_high);
}
template <int width, int height>
-inline void SmoothHorizontal16PlusxN_NEON(
+void SmoothHorizontal16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+ scaled_weights_x[0] = NegateS8(weights_x[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+ scaled_weights_x[1] = NegateS8(weights_x[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+ scaled_weights_x[2] = NegateS8(weights_x[2]);
+ scaled_weights_x[3] = NegateS8(weights_x[3]);
}
}
@@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
+// 256 - v = vneg_s8(v), valid for v in [1, 255] (the high byte stays zero).
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
template <int height>
-inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t top_v = vld1_u16(top);
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
-
- // Weighted top right doesn't change with each row.
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// Common code between 8xH and [16|32|64]xH.
inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
- const uint32x4_t& weighted_corners_low,
- const uint32x4_t& weighted_corners_high,
- const uint16x4x2_t& top_vals,
- const uint16x4x2_t& weights_x, const uint16_t left_y,
+ const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high,
+ const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
const uint16_t weight_y) {
// Each variable in the running summation is named for the last item to be
// accumulated.
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
}
template <int height>
-inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
vld1_u16(kSmoothWeights + 8)};
- // Weighted top right doesn't change with each row.
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
const uint32x4_t weighted_corners_low =
@@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// For width 16 and above.
template <int width, int height>
-inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
@@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
// Precompute weighted values that don't vary with |y|.
uint32x4_t weighted_tr_low[width >> 3];
uint32x4_t weighted_tr_high[width >> 3];
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
}
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
auto* dst_x = reinterpret_cast<uint16_t*>(dst);
@@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
}
template <int height>
-inline void SmoothVertical4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON(
}
template <int height>
-inline void SmoothVertical8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON(
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothVerticalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON(
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON(
}
template <int height>
-inline void SmoothHorizontal4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON(
auto* dst = static_cast<uint8_t*>(dest);
const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON(
}
template <int height>
-inline void SmoothHorizontal8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON(
vld1_u16(kSmoothWeights + 8)};
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
@@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothHorizontalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
-
uint16x4_t weights_x_low[width >> 3];
uint16x4_t weights_x_high[width >> 3];
uint32x4_t weighted_tr_low[width >> 3];
@@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON(
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
}
for (int y = 0; y < height; ++y) {
@@ -1141,6 +1113,7 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
SmoothHorizontalWxH_NEON<64, 64>;
}
+
} // namespace
} // namespace high_bitdepth
#endif // LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
index 617accc..e6f0d9d 100644
--- a/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
const int32x4_t max = vdupq_n_s32((1 << range) - 1);
int32x4_t s[4], x[4];
- LoadSrc<4>(dst, step, 0, x);
if (is_row) {
- Transpose4x4(x, x);
+ assert(step == 4);
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
}
// stage 1.
@@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
for (auto& i : s) {
i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
- Transpose4x4(s, s);
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, s);
}
- StoreDst<4>(dst, step, 0, s);
}
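
// vld4q_s32 de-interleaves with stride 4: element j of val[i] is loaded from
// block[4 * j + i], so a row-major 4x4 block is transposed during the load.
// An illustrative helper (not part of the library):
inline int32x4x4_t LoadTransposed4x4(const int32_t* block) {
  return vld4q_s32(block);  // val[i] holds column i of the 4x4 |block|.
}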
template <ButterflyRotationFunc butterfly_rotation,
@@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
int32x4_t s[8];
int32x4_t x[4];
- LoadSrc<4>(dst, step, 0, x);
if (is_row) {
- Transpose4x4(x, x);
+ assert(step == 4);
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
}
// stage 1.
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
- Transpose4x4(x, x);
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, x);
}
- StoreDst<4>(dst, step, 0, x);
}
alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 1c2e111..452f14a 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -41,50 +41,6 @@ namespace {
//------------------------------------------------------------------------------
-// TODO(slavarnway): Move transpose functions to transpose_neon.h or
-// common_neon.h.
-
-LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
- int16x8_t out[4]) {
- // Swap 16 bit elements. Goes from:
- // a0: 00 01 02 03
- // a1: 10 11 12 13
- // a2: 20 21 22 23
- // a3: 30 31 32 33
- // to:
- // b0.val[0]: 00 10 02 12
- // b0.val[1]: 01 11 03 13
- // b1.val[0]: 20 30 22 32
- // b1.val[1]: 21 31 23 33
- const int16x4_t a0 = vget_low_s16(in[0]);
- const int16x4_t a1 = vget_low_s16(in[1]);
- const int16x4_t a2 = vget_low_s16(in[2]);
- const int16x4_t a3 = vget_low_s16(in[3]);
-
- const int16x4x2_t b0 = vtrn_s16(a0, a1);
- const int16x4x2_t b1 = vtrn_s16(a2, a3);
-
- // Swap 32 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34
- // c0.val[1]: 02 12 22 32 06 16 26 36
- // c1.val[0]: 01 11 21 31 05 15 25 35
- // c1.val[1]: 03 13 23 33 07 17 27 37
- const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
- vreinterpret_s32_s16(b1.val[0]));
- const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
- vreinterpret_s32_s16(b1.val[1]));
-
- const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
- const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
- const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
- const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
-
- out[0] = vcombine_s16(d0, d0);
- out[1] = vcombine_s16(d1, d1);
- out[2] = vcombine_s16(d2, d2);
- out[3] = vcombine_s16(d3, d3);
-}
-
// Note this is only used in the final stage of Dct32/64 and Adst16, as the
// in-place version causes additional stack usage with clang.
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
@@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
if (stage_is_rectangular) {
if (transpose) {
- int16x8_t input[8];
- LoadSrc<8, 8>(dst, step, 0, input);
- Transpose4x8To8x4(input, x);
+ assert(step == 4);
+ int16x8x4_t y = vld4q_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
} else {
LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
- Transpose4x4(x, x);
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]);
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
}
}
@@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
if (stage_is_rectangular) {
if (transpose) {
- int16x8_t output[8];
- Transpose8x4To4x8(s, output);
- StoreDst<8, 8>(dst, step, 0, output);
+ int16x8x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s16(dst, y);
} else {
StoreDst<16, 4>(dst, step, 0, s);
}
} else {
if (transpose) {
- Transpose4x4(s, s);
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]);
+ vst4_s16(dst, y);
+ } else {
+ StoreDst<8, 4>(dst, step, 0, s);
}
- StoreDst<8, 4>(dst, step, 0, s);
}
}
@@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).
-template <bool stage_is_rectangular>
+
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- int32x4_t s[8];
- int16x8_t x[4];
+ int32x4_t s[7];
+ int16x4_t x[4];
- if (stage_is_rectangular) {
- if (transpose) {
- int16x8_t input[8];
- LoadSrc<8, 8>(dst, step, 0, input);
- Transpose4x8To8x4(input, x);
- } else {
- LoadSrc<16, 4>(dst, step, 0, x);
- }
+ if (transpose) {
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
} else {
- LoadSrc<8, 4>(dst, step, 0, x);
- if (transpose) {
- Transpose4x4(x, x);
- }
+ x[0] = vld1_s16(dst);
+ x[1] = vld1_s16(dst + 1 * step);
+ x[2] = vld1_s16(dst + 2 * step);
+ x[3] = vld1_s16(dst + 3 * step);
}
// stage 1.
- s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
- s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+ s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]);
+ s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]);
// stage 2.
- const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
- const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+ const int32x4_t a7 = vsubl_s16(x[0], x[2]);
+ const int32x4_t b7 = vaddw_s16(a7, x[3]);
// stage 3.
- s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
- s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+ s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]);
// s[0] = s[0] + s[3]
- s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+ s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]);
// s[1] = s[1] - s[4]
- s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+ s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]);
- s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+ s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]);
s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
// stage 4.
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
- x[0] = vcombine_s16(dst_0, dst_0);
- x[1] = vcombine_s16(dst_1, dst_1);
- x[2] = vcombine_s16(dst_2, dst_2);
- x[3] = vcombine_s16(dst_3, dst_3);
+ x[0] = dst_0;
+ x[1] = dst_1;
+ x[2] = dst_2;
+ x[3] = dst_3;
- if (stage_is_rectangular) {
- if (transpose) {
- int16x8_t output[8];
- Transpose8x4To4x8(x, output);
- StoreDst<8, 8>(dst, step, 0, output);
- } else {
- StoreDst<16, 4>(dst, step, 0, x);
- }
+ if (transpose) {
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4_s16(dst, y);
} else {
- if (transpose) {
- Transpose4x4(x, x);
- }
- StoreDst<8, 4>(dst, step, 0, x);
+ vst1_s16(dst, x[0]);
+ vst1_s16(dst + 1 * step, x[1]);
+ vst1_s16(dst + 2 * step, x[2]);
+ vst1_s16(dst + 3 * step, x[3]);
}
}
@@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
int i = adjusted_tx_height;
auto* data = src;
do {
- Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+ Adst4_NEON(data, /*step=*/4, /*transpose=*/true);
data += 16;
i -= 4;
} while (i != 0);
@@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
int i = tx_width;
auto* data = src;
do {
- Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+ Adst4_NEON(data, tx_width, /*transpose=*/false);
data += 4;
i -= 4;
} while (i != 0);
diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc
new file mode 100644
index 0000000..a9dd98f
--- /dev/null
+++ b/src/dsp/arm/loop_filter_10bit_neon.cc
@@ -0,0 +1,1218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+ const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+ return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
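+
+// Each packed vector holds the p side in its low half and the q side in its
+// high half (e.g. |abd_p0p1_q0q1| = {|p0-p1| x4, |q0-q1| x4}), so the vorr
+// across halves tests both sides at once. A scalar sketch (illustrative, not
+// part of the library):
+inline bool HevScalar(const uint16_t p1, const uint16_t p0, const uint16_t q0,
+                      const uint16_t q1, const uint16_t thresh) {
+  const int abd_p = (p1 > p0) ? p1 - p0 : p0 - p1;
+  const int abd_q = (q1 > q0) ? q1 - q0 : q0 - q1;
+  return abd_p > thresh || abd_q > thresh;
+}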
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+ const uint16x4_t q0, const uint16x4_t q1,
+ const uint16_t outer_thresh) {
+ const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+ const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+ const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+ const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+ const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+ return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
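+
+// The same test in scalar form (illustrative, not part of the library),
+// mirroring the comment above:
+inline bool OuterThresholdScalar(const uint16_t p1, const uint16_t p0,
+                                 const uint16_t q0, const uint16_t q1,
+                                 const uint16_t outer_thresh) {
+  const int abd_p0q0 = (p0 > q0) ? p0 - q0 : q0 - p0;
+  const int abd_p1q1 = (p1 > q1) ? p1 - q1 : q1 - p1;
+  return abd_p0q0 * 2 + abd_p1q1 / 2 <= outer_thresh;
+}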
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16x8_t abd_p2p3_q2q3,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const hev_mask,
+ uint16x4_t* const needs_filter4_mask) {
+ const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
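+// (The 8bpp flatness threshold of 1 scales by 1 << (bitdepth - 8), i.e. to 4
+// at 10bpp.)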
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p0p2_q0q2) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, const uint16_t hev_thresh,
+ const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter6_mask,
+ uint16x4_t* const is_flat3_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+ inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+ const uint16x8_t abd_pn1p0_qn1q0,
+ const uint16x8_t abd_pn2p0_qn2q0) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+ const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter8_mask,
+ uint16x4_t* const is_flat4_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ const uint16x4_t is_flat4 =
+ IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+ inner_thresh, outer_mask);
+ // |is_flat4_mask| is used to decide where to use the result of Filter8.
+ // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+ // overriding the question of whether to use Filter8. Because Filter4 doesn't
+ // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+ // source value. To be correct, the mask must account for this override.
+ *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ uint16x8_t* const p1q1_result,
+ uint16x8_t* const p0q0_result) {
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
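+  // |q0p1| is the high half of |p0q0| concatenated with the low half of
+  // |p1q1|, i.e. [q0, p1].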
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // TODO: Investigate the operations below; some look like tricks carried
+  // over from the 8bpp code, where the smallest vector covers an 8x8 block,
+  // and may be unnecessary at this width.
+
+  // We cannot use a rounding shift because the clamp comes *before* the
+  // shift:
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four =
+ Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // p0 adds a2 while q0 subtracts a1, so the second term must be -a1 rather
+  // than -a2; otherwise we would end up with a2_ma2.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+ *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+ *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+ const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+ // Adjust thresholds to bitdepth.
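+  // The thresholds are given at 8bpp scale; multiply by 1 << (bitdepth - 8),
+  // i.e. by 4 for 10bpp.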
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are left unmodified wherever Hev() is true. The XOR works because
+  // |hev_mask| was and'd with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Offset by 2 uint16_t values to load from first p1 position.
+ auto* dst = static_cast<uint8_t*>(dest) - 4;
+ auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+ auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+ auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+ uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1)};
+ Transpose4x4(src);
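+  // src[0..3] now hold the p1, p0, q0 and q1 columns respectively.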
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are left unmodified wherever Hev() is true. The XOR works because
+  // |hev_mask| was and'd with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ vst1_u16(dst_p1, output[0]);
+ vst1_u16(dst_p0, output[1]);
+ vst1_u16(dst_q0, output[2]);
+ vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
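+  // The tap weights sum to 3 + 2 + 2 + 1 = 8, matching the rounding shift by
+  // 3 below.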
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+ const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+ vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
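+  // Collapse the four lanes of |is_flat3_mask| into a single 64-bit value so
+  // one scalar test shows whether any lane is set.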
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Left side of the filter window.
+ auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Overread by 2 values. These overreads become the high halves of src_raw[2]
+ // and src_raw[3] after transpose.
+ uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ Transpose4x8(src_raw);
+ // p2, p1, p0, q0, q1, q2
+ const uint16x4_t src[6] = {
+ vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
+ vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
+ vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+ };
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ // dst_n starts at p2, so adjust to p1.
+ vst1_u16(dst_0 + 1, output[0]);
+ vst1_u16(dst_1 + 1, output[1]);
+ vst1_u16(dst_2 + 1, output[2]);
+ vst1_u16(dst_3 + 1, output[3]);
+}
+
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
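+  // The tap weights sum to 3 + 2 + 1 + 1 + 1 = 8, matching the rounding
+  // shift by 3 below.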
+
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ const uint16x4_t src[8] = {
+ vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+ const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+ const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+ const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
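+// Reverse the low (p-side) half of |a| while leaving the high (q-side) half
+// unchanged, e.g. converting p0 p1 p2 p3 | q0 q1 q2 q3 to
+// p3 p2 p1 p0 | q0 q1 q2 q3.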
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+ return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get the desired pairs after the transpose, one half should be
+  // reversed.
+ uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ LoopFilterTranspose4x8(src);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ Transpose4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+ vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+ vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+ vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
+
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+ const uint16x8_t p4q4, const uint16x8_t p3q3,
+ const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+ uint16x8_t* const p4q4_output,
+ uint16x8_t* const p3q3_output,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
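+  // 7 * p6q6 is computed as (p6q6 << 3) - p6q6 to avoid a multiply.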
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
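+  // The tap weights sum to 7 + 2 + 2 + 1 + 1 + 1 + 1 + 1 = 16, matching the
+  // rounding shift by 4 below.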
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = Transpose64(p3q3);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = Transpose64(p4q4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = Transpose64(p5q5);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+ auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+ auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+ auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+ auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+ auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+ uint16x8x2_t acdb;
+#if defined(__aarch64__)
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
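+  // vtrn1q_u64/vtrn2q_u64 are A64-only, so emulate them with 64-bit lane
+  // moves.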
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+ vreinterpretq_u64_u16(ab), 1));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+ vreinterpretq_u64_u16(ab), 0));
+#endif // defined(__aarch64__)
+ return acdb;
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ Transpose4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ Transpose4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
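+  // Each vextq_u16(a, b, 4) above pairs the high half of a p-side vector
+  // with the low half of a q-side vector, placing pN in the low half and qN
+  // in the high half.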
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+ Transpose4x8(output_p);
+ uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+ Transpose4x8(output_q);
+
+  // The permute and transpose above restore the original row order:
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+} // namespace
+
+void LoopFilterInit10bpp_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 8c03928..a8b236d 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -29,7 +29,6 @@
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
namespace {
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
@@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
@@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
@@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
@@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
@@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
vst1q_u8(dst, output_3);
}
-void Init8bpp() {
+} // namespace
+
+void LoopFilterInit_NEON() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
@@ -1178,1267 +1147,6 @@ void Init8bpp() {
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
Vertical14_NEON;
}
-} // namespace
-} // namespace low_bitdepth
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-namespace high_bitdepth {
-namespace {
-
-// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
-inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
- const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
- return vorr_u16(vget_low_u16(a), vget_high_u16(a));
-}
-
-// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
-inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
- const uint16x4_t q0, const uint16x4_t q1,
- const uint16_t outer_thresh) {
- const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
- const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
- const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
- const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
- const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
- return vcle_u16(sum, vdup_n_u16(outer_thresh));
-}
-
-// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-// OuterThreshold()
-inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
-// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
-// OuterThreshold()
-inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p1p2_q1q2,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
- const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
-// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
-// OuterThreshold()
-inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p1p2_q1q2,
- const uint16x8_t abd_p2p3_q2q3,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
- const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
- const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterNMasks functions.
-
-inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
- const uint16_t hev_thresh, const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const hev_mask,
- uint16x4_t* const needs_filter4_mask) {
- const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- // This includes cases where NeedsFilter4() is not true and so Filter2() will
- // not be applied.
- const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
-
- *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
-
- // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
- *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
-}
-
-// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
-// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p0p2_q0q2) {
- constexpr int flat_thresh = 1 << 2;
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
- const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
- return vand_u16(vget_low_u16(b), vget_high_u16(b));
-}
-
-inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, const uint16_t hev_thresh,
- const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const needs_filter6_mask,
- uint16x4_t* const is_flat3_mask,
- uint16x4_t* const hev_mask) {
- const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
- *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
- *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
- inner_thresh, outer_mask);
-}
-
-// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
-// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
-// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
-// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
- const uint16x8_t abd_pn1p0_qn1q0,
- const uint16x8_t abd_pn2p0_qn2q0) {
- constexpr int flat_thresh = 1 << 2;
- const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
- const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
- const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
- return vand_u16(vget_low_u16(c), vget_high_u16(c));
-}
-
-inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
- const uint16x8_t p1q1, const uint16x8_t p0q0,
- const uint16_t hev_thresh, const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const needs_filter8_mask,
- uint16x4_t* const is_flat4_mask,
- uint16x4_t* const hev_mask) {
- const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
- const uint16x4_t is_flat4 =
- IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
- *needs_filter8_mask =
- NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
- inner_thresh, outer_mask);
- // |is_flat4_mask| is used to decide where to use the result of Filter8.
- // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
- // overriding the question of whether to use Filter8. Because Filter4 doesn't
- // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
- // source value. To be correct, the mask must account for this override.
- *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterN functions.
-
-// Calculate Filter4() or Filter2() based on |hev_mask|.
-inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
- const uint16x8_t p1q1, const uint16x4_t hev_mask,
- uint16x8_t* const p1q1_result,
- uint16x8_t* const p0q0_result) {
- const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
- // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
- // q0mp0 means "q0 minus p0".
- const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
- const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
-
- // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
- const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
- const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
- const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
- const int16x4_t p1mq1_saturated =
- Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
- const int16x4_t hev_option =
- vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
-
- const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
-
- // TODO: Revisit this sequence; it carries over tricks from the 8bpp
- // version, where 8x8 is the smallest vector, and some of them may be
- // unnecessary at 10bpp.
-
- // We cannot shift with rounding because the clamp comes *before* the
- // shift.
- // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
- // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
- const int16x4_t plus_four =
- Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
- const int16x4_t plus_three =
- Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
- const int16x4_t a1 = vshr_n_s16(plus_four, 3);
- const int16x4_t a2 = vshr_n_s16(plus_three, 3);
-
- // a3 = (a1 + 1) >> 1;
- const int16x4_t a3 = vrshr_n_s16(a1, 1);
-
- const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
- const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
-
- // Need to shift the second term or we end up with a2_ma2.
- const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
- const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
- *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
- *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
-}
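For reference, a scalar sketch of the per-column math that Filter4() vectorizes at bitdepth 10. This is editorial, not part of the diff; Clip3() and ClipPixel() stand in for clamping helpers and the names are illustrative.

inline void Filter4Scalar(uint16_t* p1, uint16_t* p0, uint16_t* q0,
                          uint16_t* q1, bool hev) {
  const int kMin = -(1 << 9);     // min_signed_pixel above.
  const int kMax = (1 << 9) - 1;  // max_signed_pixel above.
  const int p1mq1 = Clip3(*p1 - *q1, kMin, kMax);
  // Filter2() includes the p1 - q1 term; Filter4() zeroes it via |hev|.
  const int a = 3 * (*q0 - *p0) + (hev ? p1mq1 : 0);
  const int a1 = Clip3(a + 4, kMin, kMax) >> 3;
  const int a2 = Clip3(a + 3, kMin, kMax) >> 3;
  const int a3 = (a1 + 1) >> 1;
  *q0 = ClipPixel(*q0 - a1);  // ClipPixel() clamps to [0, 1023].
  *p0 = ClipPixel(*p0 + a2);
  if (!hev) {  // The callers mask p1/q1 with hev ^ needs_filter.
    *q1 = ClipPixel(*q1 - a3);
    *p1 = ClipPixel(*p1 + a3);
  }
}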
-
-void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-
- const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
- vld1_u16(dst_q0), vld1_u16(dst_q1)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
- const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
- Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
- &needs_filter4_mask);
-
-#if defined(__aarch64__)
- // This provides a good speedup for the unit test, but may not come up often
- // enough to warrant it.
- if (vaddv_u16(needs_filter4_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
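This no-op early exit recurs verbatim in every filter size below. A sketch of the idiom factored into a single helper (hypothetical; no such helper exists in this file):

inline bool MaskIsZero(const uint16x4_t mask) {
#if defined(__aarch64__)
  // Horizontal add across the four lanes; the sum is nonzero iff any
  // lane of the comparison mask is set.
  return vaddv_u16(mask) == 0;
#else
  // armv7: reinterpret the 64-bit vector as a scalar and test it directly.
  return vget_lane_u64(vreinterpret_u64_u16(mask), 0) == 0;
#endif
}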
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter4_mask_8 =
- vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
- uint16x8_t f_p1q1;
- uint16x8_t f_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
- // Already integrated the Hev mask when calculating the filtered values.
- const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
- // p1/q1 are unmodified if only Hev() is true. The XOR below works because
- // |hev_mask| was and'd with |needs_filter4_mask| previously.
- const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
- const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- // Offset by 2 uint16_t values to load from first p1 position.
- auto* dst = static_cast<uint8_t*>(dest) - 4;
- auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
- auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
- auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
-
- uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1)};
- Transpose4x4(src);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
- const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
- Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
- &needs_filter4_mask);
-
-#if defined(__aarch64__)
- // This provides a good speedup for the unit test, but it is unclear how
- // often this case occurs in valid streams.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
- if (vaddv_u16(needs_filter4_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter4_mask_8 =
- vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
- uint16x8_t f_p1q1;
- uint16x8_t f_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
- // Already integrated the Hev mask when calculating the filtered values.
- const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
- // p1/q1 are unmodified if only Hev() is true. The XOR below works because
- // |hev_mask| was and'd with |needs_filter4_mask| previously.
- const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
- const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
- uint16x4_t output[4] = {
- vget_low_u16(p1q1_output),
- vget_low_u16(p0q0_output),
- vget_high_u16(p0q0_output),
- vget_high_u16(p1q1_output),
- };
- Transpose4x4(output);
-
- vst1_u16(dst_p1, output[0]);
- vst1_u16(dst_p0, output[1]);
- vst1_u16(dst_q0, output[2]);
- vst1_u16(dst_q1, output[3]);
-}
-
-inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p1 and q1 output from opposite directions.
- // The formula is regrouped to allow 3 doubling operations to be combined.
- //
- // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
- // ^^^^^^^^
- // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
- // ^^^^^^^^
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^^^^^^
- uint16x8_t sum = vaddq_u16(p2q2, p1q1);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^
- sum = vaddq_u16(sum, p0q0);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^
- sum = vshlq_n_u16(sum, 1);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^ ^^^^^^
- // Should dual issue with the left shift.
- const uint16x8_t q0p0 = Transpose64(p0q0);
- const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
- sum = vaddq_u16(sum, outer_sum);
-
- *p1q1_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - (2 * p2) + q0 + q1
- // q0 = q1 - (2 * q2) + p0 + p1
- // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
- // ^^^^^^^^
- const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
- // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
- // ^^^^^^^^
- sum = vsubq_u16(sum, p2q2_double);
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
-
- *p0q0_output = vrshrq_n_u16(sum, 3);
-}
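Written out in scalar form, the two regrouped sums above compute the following (an editorial sketch; the +4 matches the rounding in vrshrq_n_u16(sum, 3)):

inline void Filter6Scalar(int p2, int p1, int p0, int q0, int q1, int q2,
                          int* new_p1, int* new_p0, int* new_q0,
                          int* new_q1) {
  *new_p1 = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
  *new_q1 = (p0 + 2 * q0 + 2 * q1 + 3 * q2 + 4) >> 3;
  *new_p0 = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
  *new_q0 = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
}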
-
-void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-
- const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
- vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1), vld1_u16(dst_q2)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat3_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
- const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
- const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
- Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
- // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
- // output is not used.
- uint16x8_t f6_p1q1, f6_p0q0;
- const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
- if (vget_lane_u64(need_filter6, 0) == 0) {
- // Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
- p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- // Left side of the filter window.
- auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // Overread by 2 values. These overreads become the high halves of src_raw[2]
- // and src_raw[3] after transpose.
- uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
- Transpose4x8(src_raw);
- // p2, p1, p0, q0, q1, q2
- const uint16x4_t src[6] = {
- vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
- vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
- vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
- };
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat3_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
- const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
- const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
- Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
- // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
- // output is not used.
- uint16x8_t f6_p1q1, f6_p0q0;
- const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
- if (vget_lane_u64(need_filter6, 0) == 0) {
- // Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
- p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- uint16x4_t output[4] = {
- vget_low_u16(p1q1_output),
- vget_low_u16(p0q0_output),
- vget_high_u16(p0q0_output),
- vget_high_u16(p1q1_output),
- };
- Transpose4x4(output);
-
- // dst_n starts at p2, so adjust to p1.
- vst1_u16(dst_0 + 1, output[0]);
- vst1_u16(dst_1 + 1, output[1]);
- vst1_u16(dst_2 + 1, output[2]);
- vst1_u16(dst_3 + 1, output[3]);
-}
-
-inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
- const uint16x8_t p1q1, const uint16x8_t p0q0,
- uint16x8_t* const p2q2_output,
- uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p2 and q2 output from opposite directions.
- // The formula is regrouped to allow 2 doubling operations to be combined.
- // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
- // ^^^^^^^^
- // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
- // ^^^^^^^^
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^
- const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^
- uint16x8_t sum = vshlq_n_u16(p23q23, 1);
-
- // Add two other terms to make dual issue with shift more likely.
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^
- const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^^^
- sum = vaddq_u16(sum, p01q01);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^
- sum = vaddq_u16(sum, p3q3);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^
- const uint16x8_t q0p0 = Transpose64(p0q0);
- sum = vaddq_u16(sum, q0p0);
-
- *p2q2_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p1 and q1 output:
- // p1 = p2 - p3 - p2 + p1 + q1
- // q1 = q2 - q3 - q2 + q1 + p1
- sum = vsubq_u16(sum, p23q23);
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
-
- *p1q1_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - p3 - p1 + p0 + q2
- // q0 = q1 - q3 - q1 + q0 + p2
- sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
- const uint16x8_t q2p2 = Transpose64(p2q2);
- sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
-
- *p0q0_output = vrshrq_n_u16(sum, 3);
-}
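The same running sums in scalar form (an editorial sketch of the p side; the q side mirrors it with the roles of p and q swapped):

inline void Filter8Scalar(int p3, int p2, int p1, int p0, int q0, int q1,
                          int q2, int* new_p2, int* new_p1, int* new_p0) {
  *new_p2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
  *new_p1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
  *new_p0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
}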
-
-void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- const uint16x4_t src[8] = {
- vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
- vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
- const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
- const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
- const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() does not apply, but Filter4() applies to one or more values.
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t is_flat4_mask_8 =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- vst1_u16(dst_p2, vget_low_u16(p2q2_output));
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
- vst1_u16(dst_q2, vget_high_u16(p2q2_output));
-}
-
-inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
- return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
-}
-
-void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. To produce the
- // desired pXqX pairs after the transpose, one half of each row must be
- // reversed; LoopFilterTranspose4x8() handles this.
- uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
-
- // src[0] = p0q0
- // src[1] = p1q1
- // src[2] = p2q2
- // src[3] = p3q3
- LoopFilterTranspose4x8(src);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask = OuterThreshold(
- vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
- vget_high_u16(src[1]), outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = src[0];
- const uint16x8_t p1q1 = src[1];
- const uint16x8_t p2q2 = src[2];
- const uint16x8_t p3q3 = src[3];
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() does not apply, but Filter4() applies to one or more values.
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t is_flat4_mask_8 =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
- // After transpose, |output| will contain rows of the form:
- // p0 p1 p2 p3 q0 q1 q2 q3
- Transpose4x8(output);
-
- // Reverse p values to produce original order:
- // p3 p2 p1 p0 q0 q1 q2 q3
- vst1q_u16(dst_0, ReverseLowHalf(output[0]));
- vst1q_u16(dst_1, ReverseLowHalf(output[1]));
- vst1q_u16(dst_2, ReverseLowHalf(output[2]));
- vst1q_u16(dst_3, ReverseLowHalf(output[3]));
-}
-
-inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
- const uint16x8_t p4q4, const uint16x8_t p3q3,
- const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
- uint16x8_t* const p4q4_output,
- uint16x8_t* const p3q3_output,
- uint16x8_t* const p2q2_output,
- uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p5 and q5 output from opposite directions.
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^^
- const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^^^^^^^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^^^^^^^^^^^^^
- uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
- sum = vaddq_u16(sum, p6q6_x7);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^
- sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^
- sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^
- const uint16x8_t q0p0 = Transpose64(p0q0);
- sum = vaddq_u16(sum, q0p0);
-
- *p5q5_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p4 and q4 output:
- // p4 = p5 - (2 * p6) + p3 + q1
- // q4 = q5 - (2 * q6) + q3 + p1
- sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
-
- *p4q4_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p3 and q3 output:
- // p3 = p4 - p6 - p5 + p2 + q2
- // q3 = q4 - q6 - q5 + q2 + p2
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
- const uint16x8_t q2p2 = Transpose64(p2q2);
- sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
-
- *p3q3_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p2 and q2 output:
- // p2 = p3 - p6 - p4 + p1 + q3
- // q2 = q3 - q6 - q4 + q1 + p3
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
- const uint16x8_t q3p3 = Transpose64(p3q3);
- sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
-
- *p2q2_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p1 and q1 output:
- // p1 = p2 - p6 - p3 + p0 + q4
- // q1 = q2 - q6 - q3 + q0 + p4
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
- const uint16x8_t q4p4 = Transpose64(p4q4);
- sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
-
- *p1q1_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - p6 - p2 + q0 + q5
- // q0 = q1 - q6 - q2 + p0 + p5
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
- const uint16x8_t q5p5 = Transpose64(p5q5);
- sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
-
- *p0q0_output = vrshrq_n_u16(sum, 4);
-}
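Expanding the running sum at each step gives the full 16-tap kernels. A scalar sketch of the p side (editorial; the q side mirrors it with p and q swapped, and the +8 matches the rounding in vrshrq_n_u16(sum, 4)):

inline void Filter14Scalar(int p6, int p5, int p4, int p3, int p2, int p1,
                           int p0, int q0, int q1, int q2, int q3, int q4,
                           int q5, int out[6] /* p5' .. p0' */) {
  out[0] = (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
  out[1] =
      (5 * p6 + 2 * p5 + 2 * p4 + 2 * p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
  out[2] = (4 * p6 + p5 + 2 * p4 + 2 * p3 + 2 * p2 + p1 + p0 + q0 + q1 + q2 +
            8) >> 4;
  out[3] = (3 * p6 + p5 + p4 + 2 * p3 + 2 * p2 + 2 * p1 + p0 + q0 + q1 + q2 +
            q3 + 8) >> 4;
  out[4] = (2 * p6 + p5 + p4 + p3 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + q1 + q2 +
            q3 + q4 + 8) >> 4;
  out[5] = (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 +
            q4 + q5 + 8) >> 4;
}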
-
-void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
- auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
- auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
- auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
- auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
- auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
- auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
-
- const uint16x4_t src[14] = {
- vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
- vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
- vld1_u16(dst_q5), vld1_u16(dst_q6)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
- const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
- const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
- const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
- const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
- const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
- const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
- // Mask to choose between the outputs of Filter8 and Filter14.
- // As with the derivation of |is_flat4_mask|, the question of whether to use
- // Filter14 is only raised where |is_flat4_mask| is true.
- const uint16x4_t is_flat4_outer_mask = vand_u16(
- is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
- vabdq_u16(p0q0, p6q6)));
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
- p5q5_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() and Filter14() do not apply, but Filter4() applies to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t use_filter8_mask =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
- if (vget_lane_u64(need_filter14, 0) == 0) {
- // Filter14() does not apply, but Filter8() and Filter4() apply to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- } else {
- // All filters may contribute values to final outputs.
- const uint16x8_t use_filter14_mask =
- vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
- uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
- Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
- &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
- p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
- p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
- p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
- p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
- p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
- p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
- p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
- p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
- p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
- }
-
- vst1_u16(dst_p5, vget_low_u16(p5q5_output));
- vst1_u16(dst_p4, vget_low_u16(p4q4_output));
- vst1_u16(dst_p3, vget_low_u16(p3q3_output));
- vst1_u16(dst_p2, vget_low_u16(p2q2_output));
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
- vst1_u16(dst_q2, vget_high_u16(p2q2_output));
- vst1_u16(dst_q3, vget_high_u16(p3q3_output));
- vst1_u16(dst_q4, vget_high_u16(p4q4_output));
- vst1_u16(dst_q5, vget_high_u16(p5q5_output));
-}
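The vbslq_u16 cascade above implements this per-column decision (a scalar sketch; the helper is illustrative, not part of the library):

inline int ChooseFilterSize(bool needs_filter, bool is_flat4,
                            bool is_flat4_outer) {
  if (!needs_filter) return 0;    // Source pixels pass through unchanged.
  if (!is_flat4) return 4;        // Filter4; p1/q1 untouched where hev.
  if (!is_flat4_outer) return 8;  // Filter8.
  return 14;                      // Filter14.
}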
-
-inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
- uint16x8x2_t acdb;
-#if defined(__aarch64__)
- // a[b] <- [c]d
- acdb.val[0] = vreinterpretq_u16_u64(
- vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
- // [a]b <- c[d]
- acdb.val[1] = vreinterpretq_u16_u64(
- vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
-#else
- // a[b] <- [c]d
- acdb.val[0] = vreinterpretq_u16_u64(
- vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
- vreinterpretq_u64_u16(ab), 1));
- // [a]b <- c[d]
- acdb.val[1] = vreinterpretq_u16_u64(
- vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
- vreinterpretq_u64_u16(ab), 0));
-#endif // defined(__aarch64__)
- return acdb;
-}
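A worked example of the permutation, with a, b, c, d denoting 64-bit halves (editorial sketch):

// ab = [a, b], cd = [c, d]
// PermuteACDB64(ab, cd).val[0] == [a, c]
// PermuteACDB64(ab, cd).val[1] == [d, b]
// Reading val[0] then val[1] yields the halves in the order a, c, d, b,
// which is what the p7p3/q3q7-style pairings below rely on.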
-
-void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // Low halves: p7 p6 p5 p4
- // High halves: p3 p2 p1 p0
- uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
- // p7 will be the low half of src_p[0]. Not used until the end.
- Transpose4x8(src_p);
-
- // Low halves: q0 q1 q2 q3
- // High halves: q4 q5 q6 q7
- uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
- vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
- // q7 will be the high half of src_q[3]. Not used until the end.
- Transpose4x8(src_q);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask = OuterThreshold(
- vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
- vget_low_u16(src_q[1]), outer_thresh);
- const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
- const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
- const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
- const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
- const uint16x8_t p4q4 =
- vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
- const uint16x8_t p5q5 =
- vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
- const uint16x8_t p6q6 =
- vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
- const uint16x8_t p7q7 =
- vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
- // Mask to choose between the outputs of Filter8 and Filter14.
- // As with the derivation of |is_flat4_mask|, the question of whether to use
- // Filter14 is only raised where |is_flat4_mask| is true.
- const uint16x4_t is_flat4_outer_mask = vand_u16(
- is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
- vabdq_u16(p0q0, p6q6)));
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
- p5q5_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() and Filter14() do not apply, but Filter4() applies to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t use_filter8_mask =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
- if (vget_lane_u64(need_filter14, 0) == 0) {
- // Filter14() does not apply, but Filter8() and Filter4() apply to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- } else {
- // All filters may contribute values to final outputs.
- const uint16x8_t use_filter14_mask =
- vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
- uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
- Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
- &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
- p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
- p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
- p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
- p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
- p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
- p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
- p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
- p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
- p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
- }
- // To get the correctly ordered rows from the transpose, we need:
- // p7p3 p6p2 p5p1 p4p0
- // q0q4 q1q5 q2q6 q3q7
- const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
- const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
- const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
- const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
- uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
- p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
- Transpose4x8(output_p);
- uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
- p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
- Transpose4x8(output_q);
-
-  // The permuted pairs transpose into rows that are already in the original
-  // order:
-  // p7 p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 q7
- vst1q_u16(dst_0, output_p[0]);
- vst1q_u16(dst_0 + 8, output_q[0]);
- vst1q_u16(dst_1, output_p[1]);
- vst1q_u16(dst_1 + 8, output_q[1]);
- vst1q_u16(dst_2, output_p[2]);
- vst1q_u16(dst_2 + 8, output_q[2]);
- vst1q_u16(dst_3, output_p[3]);
- vst1q_u16(dst_3 + 8, output_q[3]);
-}
-
-void Init10bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
- assert(dsp != nullptr);
- dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
- Horizontal4_NEON;
- dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
- dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
- Horizontal6_NEON;
- dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
- dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
- Horizontal8_NEON;
- dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
- dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
- Horizontal14_NEON;
- dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
- Vertical14_NEON;
-}
-
-} // namespace
-} // namespace high_bitdepth
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
-
-void LoopFilterInit_NEON() {
- low_bitdepth::Init8bpp();
-#if LIBGAV1_MAX_BITDEPTH >= 10
- high_bitdepth::Init10bpp();
-#endif
-}
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
index 540defc..531cd0d 100644
--- a/src/dsp/arm/loop_filter_neon.h
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
// Initializes Dsp::loop_filters, see the defines below for specifics. This
// function is not thread-safe.
void LoopFilterInit_NEON();
+void LoopFilterInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index 2db137f..cd8552e 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], mas[2];
uint16x8_t sq[2][4], bs[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint8_t* const dst) {
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[4];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
@@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint16x8_t ma[2];
uint8x16_t masx[3];
uint32x4x2_t b[2];
- // TODO(b/194217060): Future msan load.
s[1] = vld1q_u8(src0 + x + 16);
BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
@@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[3];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
@@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow(
uint8x16_t s[2], ma3[2], ma5[2];
uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
@@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow(
do {
uint8x16_t ma3x[3], ma5x[3];
int16x8_t p[2];
- // TODO(b/194217060): Future msan load.
s[1] = vld1q_u8(src0 + x + 16);
BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index 853f949..ecc67f8 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -33,50 +33,40 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
-// Compound predictors use int16_t values and need to multiply long because the
-// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by
-// int8_t and accumulate into int32_t instruction.
-template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
- if (subsampling_x == 1) {
- const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
- const int16x4_t mask_val1 = vreinterpret_s16_u16(
- vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
- int16x8_t final_val;
- if (subsampling_y == 1) {
- const int16x4_t next_mask_val0 =
- vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
- const int16x4_t next_mask_val1 =
- vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
- final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
- vcombine_s16(next_mask_val0, next_mask_val1));
- } else {
- final_val = vreinterpretq_s16_u16(
- vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
- }
- return vrshrq_n_s16(final_val, subsampling_y + 1);
+template <int subsampling_y>
+inline uint8x8_t GetMask4x2(const uint8_t* mask) {
+ if (subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz));
+ const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz));
+
+ const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]),
+ vreinterpret_u8_u32(row_02_13.val[1])),
+ 1);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const uint8x8_t mask_val0 = Load4(mask);
- const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
- return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+ // subsampling_x == 1
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
}
template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+inline uint8x8_t GetMask8(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshr_n_u8(
+ vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1);
+ }
if (subsampling_x == 1) {
- int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
- if (subsampling_y == 1) {
- const int16x8_t next_mask_val =
- vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
- mask_val = vaddq_s16(mask_val, next_mask_val);
- }
- return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
}
assert(subsampling_y == 0 && subsampling_x == 0);
- const uint8x8_t mask_val = vld1_u8(mask);
- return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+ return vld1_u8(mask);
}
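In scalar terms, the 2x2 subsampled paths above compute a rounded four-sample average, equivalent to (a + b + c + d + 2) >> 2, without letting the intermediate sum wrap (an editorial sketch; the helper name is illustrative):

inline uint8_t AverageMask2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  const int row0 = a + b;  // Each mask value is <= 64, so a row sum fits
  const int row1 = c + d;  // in a uint8_t lane.
  // vhadd_u8 halves while adding, before row0 + row1 (up to 256) could wrap
  // a uint8_t lane; vrshr_n_u8(..., 1) then applies the rounding shift.
  const int half = (row0 + row1) >> 1;
  return static_cast<uint8_t>((half + 1) >> 1);
}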
inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
@@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
StoreHi4(dst + dst_stride, result);
}
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
const int16_t* LIBGAV1_RESTRICT pred_1,
const uint8_t* LIBGAV1_RESTRICT mask,
- const ptrdiff_t mask_stride,
uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const int16x8_t mask_inverter = vdupq_n_s16(64);
- int16x8_t pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ // Compound predictors use int16_t values and need to multiply long because
+ // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply
+ // int16_t by int8_t and accumulate into int32_t instruction.
+ int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- // TODO(b/150461164): Arm tends to do better with load(val); val += stride
- // It may be possible to turn this into a loop with a templated height.
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
}
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
const int16_t* LIBGAV1_RESTRICT pred_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int height,
+ const int height,
uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
- MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride);
return;
}
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const int16x8_t mask_inverter = vdupq_n_s16(64);
int y = 0;
do {
int16x8_t pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask)));
int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
y += 8;
} while (y < height);
}
+inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1) {
+ // First 8 values.
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi = vmlal_s16(
+ weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+}
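
As a scalar model of what CombinePred8 computes per pixel (BlendPixel is a hypothetical name, not a libgav1 function; it assumes arithmetic right shifts on negative ints, which matches the NEON shift behavior):

    #include <algorithm>
    #include <cstdint>

    inline uint8_t BlendPixel(int16_t pred_0, int16_t pred_1, int mask_value) {
      // vshrn_n_s32(..., 6): truncating (arithmetic) shift.
      const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
      // vqrshrun_n_s16(..., 4): rounding shift with unsigned saturation;
      // inter_post_round_bits is 4 for 8bpp.
      const int rounded = (res + 8) >> 4;
      return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
    }
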
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = height;
+ do {
+ const int16x8_t pred_mask_0 =
+ ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask));
+ // 64 - mask
+ const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ const uint8x8_t result =
+ CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1);
+ vst1_u8(dst, result);
+ dst += dst_stride;
+ mask += 8 << (subsampling_x + subsampling_y);
+ pred_0 += 8;
+ pred_1 += 8;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val0 = vld2q_u8(mask);
+ const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride);
+ const uint8x16_t combined_horz0 =
+ vaddq_u8(mask_val0.val[0], mask_val0.val[1]);
+ const uint8x16_t combined_horz1 =
+ vaddq_u8(mask_val1.val[0], mask_val1.val[1]);
+    // Use a halving add to work around the case where all |mask| values are
+    // 64.
+ return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1);
+ }
+ if (subsampling_x == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ return vrhaddq_u8(mask_val.val[0], mask_val.val[1]);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1q_u8(mask);
+}
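
In the 4:2:0 branch of GetMask16 each output lane is the rounded average of a 2x2 block of mask values; the halving add keeps the 8-bit lanes from overflowing when every value is 64 (the raw sum can reach 4 * 64 = 256). A scalar model that produces the same result (AverageMask2x2 is a hypothetical helper):

    #include <cstddef>
    #include <cstdint>

    inline uint8_t AverageMask2x2(const uint8_t* mask, ptrdiff_t mask_stride,
                                  int x) {
      const int sum = mask[2 * x] + mask[2 * x + 1] +
                      mask[mask_stride + 2 * x] + mask[mask_stride + 2 * x + 1];
      // vhaddq_u8 followed by vrshr_n_u8(..., 1) is equivalent to this
      // rounded average without ever exceeding a uint8_t lane.
      return static_cast<uint8_t>((sum + 2) >> 2);
    }
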
+
template <int subsampling_x, int subsampling_y>
inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1,
@@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
if (width == 4) {
- MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst,
+ dst_stride);
+ return;
+ }
+ if (width == 8) {
+ MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr,
+ height, dst, dst_stride);
return;
}
const uint8_t* mask = mask_ptr;
@@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
do {
int x = 0;
do {
- const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>(
mask + (x << subsampling_x), mask_stride);
+ const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0));
+ const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0));
// 64 - mask
- const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
- const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
- const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+ const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo);
+ const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi);
+
uint8x8_t result;
- // int res = (mask_value * prediction_0[x] +
- // (64 - mask_value) * prediction_1[x]) >> 6;
- const int32x4_t weighted_pred_0_lo =
- vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
- const int32x4_t weighted_pred_0_hi =
- vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
- const int32x4_t weighted_combo_lo =
- vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
- vget_low_s16(pred_val_1));
- const int32x4_t weighted_combo_hi =
- vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
- vget_high_s16(pred_val_1));
-
- // dst[x] = static_cast<Pixel>(
- // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
- // (1 << kBitdepth8) - 1));
- result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
- vshrn_n_s32(weighted_combo_hi, 6)),
- 4);
+ result =
+ CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo);
vst1_u8(dst + x, result);
- x += 8;
+ result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi,
+ pred_mask_1_hi);
+ vst1_u8(dst + x + 8, result);
+
+ x += 16;
} while (x < width);
dst += dst_stride;
pred_0 += width;
@@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
} while (++y < height);
}
-// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
-// values) but regresses compound versions (input is int16_t). Try to
-// consolidate these.
template <int subsampling_x, int subsampling_y>
inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
- const uint8x8_t mask_val =
- vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
- if (subsampling_y == 1) {
- const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
- vld1_u8(mask + mask_stride * 3));
-
- // Use a saturating add to work around the case where all |mask| values
- // are 64. Together with the rounding shift this ensures the correct
- // result.
- const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
- return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
- }
-
- return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+ return GetMask4x2<subsampling_y>(mask);
}
-
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
assert(subsampling_y == 0 && subsampling_x == 0);
const uint8x8_t mask_val0 = Load4(mask);
- // TODO(b/150461164): Investigate the source of |mask| and see if the stride
- // can be removed.
- // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
return Load4<1>(mask + mask_stride, mask_val0);
}
-template <int subsampling_x, int subsampling_y>
-inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
- ptrdiff_t mask_stride) {
- if (subsampling_x == 1) {
- const uint8x16_t mask_val = vld1q_u8(mask);
- const uint8x8_t mask_paired =
- vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
- if (subsampling_y == 1) {
- const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
- const uint8x8_t next_mask_paired =
- vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
-
- // Use a saturating add to work around the case where all |mask| values
- // are 64. Together with the rounding shift this ensures the correct
- // result.
- const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
- return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
- }
-
- return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
- }
-
- assert(subsampling_y == 0 && subsampling_x == 0);
- return vld1_u8(mask);
-}
-
inline void InterIntraWriteMaskBlendLine8bpp4x2(
const uint8_t* LIBGAV1_RESTRICT const pred_0,
uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
@@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON(
}
template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp8xH_NEON(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int height) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = height;
+ do {
+ const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask);
+ // 64 - mask
+ const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ const uint8x8_t pred_val_1 = vld1_u8(pred_1);
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ vst1_u8(pred_1, result);
+
+ pred_0 += 8;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlend8bpp_NEON(
const uint8_t* LIBGAV1_RESTRICT prediction_0,
uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
@@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON(
height);
return;
}
+ if (width == 8) {
+ InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
const uint8_t* mask = mask_ptr;
- const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
int y = 0;
do {
int x = 0;
do {
- // TODO(b/150461164): Consider a 16 wide specialization (at least for the
- // unsampled version) to take advantage of vld1q_u8().
- const uint8x8_t pred_mask_1 =
- GetInterIntraMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride);
+ const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
// 64 - mask
- const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
- const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+ const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0);
prediction_0 += 8;
- const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
- const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+      // Load 16 bytes at once to ensure the armv7 build combines the load.
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x);
+ const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1);
+ const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1);
+ const uint16x8_t weighted_pred_0_lo =
+ vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo);
// weighted_pred0 + weighted_pred1
- const uint16x8_t weighted_combo =
- vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
- const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
- vst1_u8(prediction_1 + x, result);
+ const uint16x8_t weighted_combo_lo =
+ vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo);
+ const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6);
+ vst1_u8(prediction_1 + x, result_lo);
+ const uint16x8_t weighted_pred_0_hi =
+ vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo_hi = vmlal_u8(
+ weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi);
+ const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6);
+ vst1_u8(prediction_1 + x + 8, result_hi);
- x += 8;
+ x += 16;
} while (x < width);
prediction_1 += prediction_stride_1;
mask += mask_stride << subsampling_y;
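
A scalar model of the inter-intra blend used throughout this hunk: |prediction_1| is overwritten in place, with the mask weighting it and the complement weighting |prediction_0| (InterIntraBlendPixel is a hypothetical name):

    #include <cstdint>

    inline uint8_t InterIntraBlendPixel(uint8_t pred_0, uint8_t pred_1,
                                        uint8_t mask_value) {
      // vrshrn_n_u16(..., 6) supplies the + 32 rounding; the maximum
      // accumulator value, 64 * 255 + 32, fits in 16 bits.
      return static_cast<uint8_t>(
          ((64 - mask_value) * pred_0 + mask_value * pred_1 + 32) >> 6);
    }
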
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 659ed8e..271bbaa 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
StoreLo4(pred, result);
}
+inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
+ const uint8x8_t obmc_pred_val,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ vst1_u8(pred, result);
+}
+
inline void OverlapBlendFromLeft2xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
@@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON(
inline void OverlapBlendFromLeft8xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
const uint8x8_t mask_inverter = vdup_n_u8(64);
const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+ constexpr int obmc_prediction_stride = 8;
// 64 - mask
const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
int y = 0;
do {
- const uint8x8_t pred_val = vld1_u8(pred);
- const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
- const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
- const uint8x8_t result =
- vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
- vst1_u8(pred, result);
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (++y != height);
+
+ obmc_pred += obmc_prediction_stride << 1;
+ y += 2;
+ } while (y != height);
}
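
WriteObmcLine8 and the 2x/4x paths all apply the same per-pixel blend, with a pair of kObmcMask-derived weights that always sum to 64. A scalar sketch (ObmcBlendPixel is a hypothetical name):

    #include <cstdint>

    inline uint8_t ObmcBlendPixel(uint8_t pred, uint8_t obmc_pred,
                                  uint8_t pred_mask) {
      // vrshrn_n_u16(..., 6) supplies the + 32 rounding.
      return static_cast<uint8_t>(
          (pred_mask * pred + (64 - pred_mask) * obmc_pred + 32) >> 6);
    }
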
void OverlapBlendFromLeft_NEON(
@@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON(
return;
}
if (width == 8) {
- OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
const uint8x16_t mask_inverter = vdupq_n_u8(64);
@@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON(
inline void OverlapBlendFromTop8xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 8;
const uint8x8_t mask_inverter = vdup_n_u8(64);
const uint8_t* mask = kObmcMask + height - 2;
const int compute_height = height - (height >> 2);
int y = 0;
do {
- const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
// 64 - mask
- const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
- const uint8x8_t pred_val = vld1_u8(pred);
- const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
- const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
- const uint8x8_t result =
- vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
- vst1_u8(pred, result);
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
+ obmc_pred_mask0);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (++y != compute_height);
+ ++y;
+
+ const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
+ obmc_pred_mask1);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (++y < compute_height);
}
void OverlapBlendFromTop_NEON(
@@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON(
}
if (width == 8) {
- OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
@@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = {
33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
-inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
const uint16x4_t pred_mask,
const uint16x4_t obmc_pred_mask) {
- const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x4_t obmc_pred_val =
- vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ const uint16x4_t pred_val = vld1_u16(pred);
const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
const uint16x4_t result =
vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
return result;
}
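
Note why the 10bpp path can use vmul_u16/vmla_u16 without a widening multiply: the two weights sum to 64, so the accumulator is at most 64 * 1023 = 65472 and the + 32 rounding in vrshr_n_u16(..., 6) still fits in a 16-bit lane. A scalar sketch (hypothetical helper name):

    #include <cstdint>

    inline uint16_t ObmcBlendPixel10bpp(uint16_t pred, uint16_t obmc_pred,
                                        uint16_t pred_mask) {
      // Maximum value is 64 * 1023 = 65472, so uint16_t cannot wrap.
      const uint32_t weighted = static_cast<uint32_t>(
          pred_mask * pred + (64 - pred_mask) * obmc_pred);
      return static_cast<uint16_t>((weighted + 32) >> 6);
    }
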
-inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
const uint16x8_t pred_mask,
const uint16x8_t obmc_pred_mask) {
- const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x8_t obmc_pred_val =
- vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
const uint16x8_t result =
vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
@@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
}
inline void OverlapBlendFromLeft2xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 2;
const uint16x4_t mask_inverter = vdup_n_u16(64);
// Second two lanes unused.
const uint16x4_t pred_mask = vld1_u16(kObmcMask);
const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
int y = 0;
do {
+ const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
const uint16x4_t result_0 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0);
+ BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_0);
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
+ const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
const uint16x4_t result_1 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1);
+ BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_1);
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
y += 2;
@@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON(
}
inline void OverlapBlendFromLeft4xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 4;
const uint16x4_t mask_inverter = vdup_n_u16(64);
const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
// 64 - mask
const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
int y = 0;
do {
- const uint16x4_t result_0 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- const uint16x4_t result_1 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_0);
+ pred = AddByteStride(pred, prediction_stride);
+
+ const uint16x4_t result_1 = BlendObmc2Or4(
+ pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
y += 2;
} while (y != height);
@@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON(
const int width, const int height,
const void* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
- auto* pred = static_cast<uint8_t*>(prediction);
- const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
assert(width >= 2);
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
const uint16x8_t mask_inverter = vdupq_n_u16(64);
const uint16_t* mask = kObmcMask + width - 2;
int x = 0;
do {
- pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x);
- obmc_pred = reinterpret_cast<const uint8_t*>(
- static_cast<const uint16_t*>(obmc_prediction) + x);
+ uint16_t* pred_x = pred + x;
+ const uint16_t* obmc_pred_x = obmc_pred + x;
const uint16x8_t pred_mask = vld1q_u16(mask + x);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
int y = 0;
do {
const uint16x8_t result =
- BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred_x, result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred_x = AddByteStride(pred_x, prediction_stride);
+ obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
} while (++y < height);
x += 8;
} while (x < width);
}
template <int lane>
-inline uint16x4_t BlendObmcFromTop4(
- uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
- const uint16x8_t obmc_pred_mask) {
- const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x4_t obmc_pred_val =
- vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
+ const uint16x8_t pred_mask,
+ const uint16x8_t obmc_pred_mask) {
+ const uint16x4_t pred_val = vld1_u16(pred);
const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
const uint16x4_t result = vrshr_n_u16(
VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4(
template <int lane>
inline uint16x8_t BlendObmcFromTop8(
- uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
- const uint16x8_t obmc_pred_mask) {
- const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x8_t obmc_pred_val =
- vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+ const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
const uint16x8_t result = vrshrq_n_u16(
VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8(
}
inline void OverlapBlendFromTop4x2Or4_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride, const int height) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+ constexpr int obmc_prediction_stride = 4;
const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
const uint16x8_t mask_inverter = vdupq_n_u16(64);
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
- uint16x4_t result =
- BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
if (height == 2) {
// Mask value is 64, meaning |pred| is unchanged.
return;
}
- result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
- result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+ const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
+ result =
+ BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
}
inline void OverlapBlendFromTop4xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
if (height < 8) {
- OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred,
- obmc_prediction_stride, height);
+ OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
return;
}
+ constexpr int obmc_prediction_stride = 4;
const uint16_t* mask = kObmcMask + height - 2;
const uint16x8_t mask_inverter = vdupq_n_u16(64);
int y = 0;
@@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON(
do {
const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
- uint16x4_t result =
- BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+    // Load obmc rows 0 and 1.
+ uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc rows 2 and 3.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+    // Load obmc rows 4 and 5.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+    // Increment to the correct mask index.
y += 6;
@@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON(
}
inline void OverlapBlendFromTop8xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride, const int height) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
const uint16_t* mask = kObmcMask + height - 2;
const uint16x8_t mask_inverter = vdupq_n_u16(64);
uint16x8_t pred_mask = vld1q_u16(mask);
uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
uint16x8_t result =
BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 2) return;
- pred += prediction_stride;
+ constexpr int obmc_prediction_stride = 8;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 4) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 8) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
pred_mask = vld1q_u16(&mask[8]);
obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 16) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
pred_mask = vld1q_u16(&mask[16]);
obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
}
void OverlapBlendFromTop_NEON(
@@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON(
const int width, const int height,
const void* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
- auto* pred = static_cast<uint8_t*>(prediction);
- const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
assert(width >= 4);
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred,
- obmc_prediction_stride, height);
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
return;
}
@@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON(
const uint16x8_t pred_mask = vld1q_u16(mask);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
-#define OBMC_ROW_FROM_TOP(n) \
- do { \
- int x = 0; \
- do { \
- const uint16x8_t result = BlendObmcFromTop8<n>( \
- reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \
- reinterpret_cast<const uint8_t*>( \
- reinterpret_cast<const uint16_t*>(obmc_pred) + x), \
- pred_mask, obmc_pred_mask); \
- vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \
- \
- x += 8; \
- } while (x < width); \
+#define OBMC_ROW_FROM_TOP(n) \
+ do { \
+ int x = 0; \
+ do { \
+ const uint16x8_t result = BlendObmcFromTop8<n>( \
+ pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
+ vst1q_u16(pred + x, result); \
+ \
+ x += 8; \
+ } while (x < width); \
} while (false)
// Compute 1 row.
@@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON(
// Compute 3 rows.
if (height == 4) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
return;
}
@@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON(
// Compute 6 rows.
if (height == 8) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
return;
}
@@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON(
// Compute 12 rows.
if (height == 16) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(6);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(7);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
return;
}
@@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON(
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(6);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(7);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
y += 8;
} while (y < compute_height);
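
The AddByteStride calls above replace the uint8_t*/uint16_t* reinterpret_cast round trips this diff removes. A sketch consistent with that usage (the real helper is defined elsewhere in libgav1, so the exact signature here is an assumption):

    #include <cstddef>
    #include <cstdint>

    inline uint16_t* AddByteStrideSketch(uint16_t* ptr, ptrdiff_t byte_stride) {
      // Strides are expressed in bytes even though the pixels are 16-bit.
      return reinterpret_cast<uint16_t*>(reinterpret_cast<uint8_t*>(ptr) +
                                         byte_stride);
    }
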
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index 71e0a43..da380b1 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
do {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const uint8_t* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint8_t row_border_pixel =
first_row_border[row * source_stride];
@@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
int sum = first_row_border[row * source_stride];
sum <<= (kFilterBits - kInterRoundBitsHorizontal);
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
#if defined(__aarch64__)
@@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint8_t* const src_row = src + row * source_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 bytes that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding byte after the right border of the last source row.
- const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]);
// Convert src_row_v to int8 (subtract 128).
const int8x16_t src_row_centered =
vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
HorizontalFilter(sx4, alpha, src_row_centered,
intermediate_result[y + 7]);
@@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
} else {
// Region 4.
// Horizontal filter.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
const uint8_t* const src_row = src + row * source_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 bytes that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding byte after the right border of the last source row.
- const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ const uint8x16_t src_row_v =
+ vld1q_u8(&src_row[filter_params.ix4 - 7]);
// Convert src_row_v to int8 (subtract 128).
const int8x16_t src_row_centered =
vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
@@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
int16x8_t filter[8];
@@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
do {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const uint16_t* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint16_t row_border_pixel = first_row_border[row * src_stride];
DestType* dst_row = dst + start_x - block_start_x;
@@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
int sum = first_row_border[row * src_stride];
sum <<= (kFilterBits - kInterRoundBitsHorizontal);
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
#if defined(__aarch64__)
@@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint16_t* const src_row = src + row * src_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 pixels that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding pixel after the right border of the last source row.
- const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
sx4 += beta;
@@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
} else {
// Region 4.
// Horizontal filter.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
const uint16_t* const src_row = src + row * src_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// frame has left and right borders of at least 13 pixels that extend
// the frame boundary pixels. We also assume there is at least one
// extra padding pixel after the right border of the last source row.
- const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
sx4 += beta;
}
@@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
int16x8_t filter[8];
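
Note: the warp_neon.cc hunks above replace the loose locals iy4/ix4/x4/y4 with members of a filter_params object, i.e. the warp setup (integer source position plus sub-pixel offset) now arrives pre-computed from a shared helper. A minimal sketch of the assumed struct; the field names are inferred from the usage in these hunks and the real definition (in warp.h/warp.cc) may differ:

    struct WarpFilterParams {  // hypothetical reconstruction from usage above
      int64_t x4;  // x position at kWarpedModelPrecisionBits precision
      int64_t y4;  // y position at the same precision
      int ix4;     // integer x, roughly x4 >> kWarpedModelPrecisionBits
      int iy4;     // integer y, roughly y4 >> kWarpedModelPrecisionBits
    };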
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
index 273b355..1a37aa1 100644
--- a/src/dsp/average_blend.cc
+++ b/src/dsp/average_blend.cc
@@ -87,6 +87,21 @@ void Init10bpp() {
}
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_AverageBlend
+ dsp->average_blend = AverageBlend_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
} // namespace
void AverageBlendInit_C() {
@@ -94,6 +109,9 @@ void AverageBlendInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
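
Note: every Init12bpp() added in this version follows the same shape, condensed below: when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is set (test builds) each table entry takes the C implementation unconditionally; otherwise the C version is assigned only if no LIBGAV1_Dsp12bpp_* macro declared a specialized implementation at compile time. Sketch of the recurring pattern, not a drop-in source file:

    void Init12bpp() {
      Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
      assert(dsp != nullptr);
    #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
      dsp->average_blend = AverageBlend_C<12, uint16_t>;  // always the C path
    #else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
    #ifndef LIBGAV1_Dsp12bpp_AverageBlend   // defined when a SIMD version exists
      dsp->average_blend = AverageBlend_C<12, uint16_t>;  // C fallback only
    #endif
    #endif
    }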
diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc
index 04e24e5..6d1100a 100644
--- a/src/dsp/average_blend_test.cc
+++ b/src/dsp/average_blend_test.cc
@@ -59,6 +59,7 @@ template <int bitdepth, typename Pixel>
class AverageBlendTest : public testing::TestWithParam<BlockSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
AverageBlendTest() = default;
~AverageBlendTest() override = default;
@@ -282,6 +283,60 @@ INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AverageBlendTest12bpp = AverageBlendTest<12, uint16_t>;
+
+const char* GetAverageBlendDigest12bpp(const BlockSize block_size) {
+ static const char* const kDigests[kMaxBlockSizes] = {
+ // 4xN
+ "8f5ad8fba61a0f1cb6b77f5460c241be",
+ "3a9d017848fdb4162315c689b4449ac6",
+ "bb97029fff021b168b98b209dcee5123",
+ // 8xN
+ "a7ff1b199965b8856499ae3f1b2c48eb",
+ "05220c72835fc4662d261183df0a57cf",
+ "97de8c325f1475c44e1afc44183e55ad",
+ "60d820c46cad14d9d934da238bb79707",
+ // 16xN
+ "f3e4863121819bc28f7c1f453898650c",
+ "5f5f68d21269d7df546c848921e8f2cd",
+ "17efe0b0fce1f8d4c7bc6eacf769063e",
+ "3da591e201f44511cdd6c465692ace1e",
+ "5a0ca6c88664d2e918a032b5fcf66070",
+ // 32xN
+ "efe236bee8a9fef90b99d8012006f985",
+ "d6ff3aacbbbadff6d0ccb0873fb9fa2a",
+ "38801f7361052873423d57b574aabddc",
+ "55c76772ecdc1721e92ca04d2fc7c089",
+ // 64xN
+ "4261ecdde34eedc4e5066a93e0f64881",
+ "fe82e012efab872672193316d670fd82",
+ "6c698bc2d4acf4444a64ac55ae9641de",
+ "98626e25101cff69019d1b7e6e439404",
+ // 128xN
+ "fe0f3c89dd39786df1c952a2470d680d",
+ "af7e166fc3d8c9ce85789acf3467ed9d",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+TEST_P(AverageBlendTest12bpp, Blending) {
+ Test(GetAverageBlendDigest12bpp(GetParam()), 1, false);
+}
+
+TEST_P(AverageBlendTest12bpp, DISABLED_Speed) {
+ Test(GetAverageBlendDigest12bpp(GetParam()),
+ kNumSpeedTests /
+ (kBlockHeightPixels[GetParam()] * kBlockHeightPixels[GetParam()]) /
+ 2,
+ false);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest12bpp,
+ testing::ValuesIn(kTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
index ca2adfd..9dd9287 100644
--- a/src/dsp/cdef.cc
+++ b/src/dsp/cdef.cc
@@ -32,9 +32,11 @@ namespace {
#include "src/dsp/cdef.inc"
// Silence unused function warnings when CdefDirection_C is obviated.
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
- !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
- (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && \
+ !defined(LIBGAV1_Dsp10bpp_CdefDirection)) || \
+ (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefDirection))
constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
int32_t Square(int32_t x) { return x * x; }
@@ -103,12 +105,15 @@ void CdefDirection_C(const void* LIBGAV1_RESTRICT const source,
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
// !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
// (LIBGAV1_MAX_BITDEPTH >= 10 &&
- // !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+ // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) ||
+ // (LIBGAV1_MAX_BITDEPTH == 12 &&
+ // !defined(LIBGAV1_Dsp12bpp_CdefDirection))
// Silence unused function warnings when CdefFilter_C is obviated.
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
- !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
- (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
+ (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) || \
+ (LIBGAV1_MAX_BITDEPTH == 12 && !defined(LIBGAV1_Dsp12bpp_CdefFilters))
int Constrain(int diff, int threshold, int damping) {
assert(threshold != 0);
@@ -218,7 +223,9 @@ void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src,
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
// !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
// (LIBGAV1_MAX_BITDEPTH >= 10 &&
- // !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+ // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) ||
+ // (LIBGAV1_MAX_BITDEPTH == 12 &&
+ // !defined(LIBGAV1_Dsp12bpp_CdefFilters))
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
@@ -294,7 +301,48 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_CdefDirection
+ dsp->cdef_direction = CdefDirection_C<12, uint16_t>;
#endif
+#ifndef LIBGAV1_Dsp12bpp_CdefFilters
+ dsp->cdef_filters[0][0] = CdefFilter_C<4, 12, uint16_t>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] =
+ CdefFilter_C<4, 12, uint16_t, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_C<8, 12, uint16_t>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/true,
+ /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] =
+ CdefFilter_C<8, 12, uint16_t, /*enable_primary=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -303,6 +351,9 @@ void CdefInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
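
Note: the cdef_filters table shape can be read off the template arguments above. The first index selects the block width (0 for the 4-pixel-wide variants, 1 for the 8-pixel-wide ones); the second selects which passes run (0 = primary and secondary, 1 = primary only, 2 = secondary only). A hypothetical lookup, assuming those index meanings:

    // block_width is 4 or 8; the enable flags come from the decoded
    // CDEF strengths for the current block.
    const int width_index = (block_width == 8) ? 1 : 0;
    const int pass_index = enable_secondary ? (enable_primary ? 0 : 2) : 1;
    const auto filter_fn = dsp->cdef_filters[width_index][pass_index];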
diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h
index b820b77..ce23ea5 100644
--- a/src/dsp/cdef.h
+++ b/src/dsp/cdef.h
@@ -38,6 +38,11 @@
namespace libgav1 {
namespace dsp {
+enum {
+ kCdefSecondaryTap0 = 2,
+ kCdefSecondaryTap1 = 1,
+};
+
// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
// thread-safe.
void CdefInit_C();
diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc
index c10a8d7..c25d7df 100644
--- a/src/dsp/cdef_test.cc
+++ b/src/dsp/cdef_test.cc
@@ -46,10 +46,11 @@ constexpr int kSourceBufferSize =
constexpr int kNumSpeedTests = 5000;
const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
- static const char* const kDigest[2][2] = {
+ static const char* const kDigest[3][2] = {
{"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"},
- {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}};
- const int bitdepth_index = (bitdepth == 8) ? 0 : 1;
+ {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"},
+ {"5532919a157c4f937da9e822bdb105f7", "dd9dfca6dfca83777d942e693c17627a"}};
+ const int bitdepth_index = (bitdepth - 8) / 2;
const int run_index = (num_runs == 1) ? 0 : 1;
return kDigest[bitdepth_index][run_index];
}
@@ -59,6 +60,7 @@ const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
template <int bitdepth, typename Pixel>
class CdefDirectionTest : public testing::TestWithParam<int> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
CdefDirectionTest() = default;
CdefDirectionTest(const CdefDirectionTest&) = delete;
CdefDirectionTest& operator=(const CdefDirectionTest&) = delete;
@@ -167,6 +169,18 @@ INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest10bpp, testing::Values(0));
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefDirectionTest12bpp = CdefDirectionTest<12, uint16_t>;
+
+TEST_P(CdefDirectionTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest12bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest12bpp, testing::Values(0));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
const char* GetDigest8bpp(int id) {
static const char* const kDigest[] = {
"b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33",
@@ -199,6 +213,23 @@ const char* GetDigest10bpp(int id) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "a32569989c42fd4254979f70c1c65f5a", "dc389048217633e2dd64126376be7d25",
+ "3b0e8dae294895330f349863b1773c39", "9741fe8d27d109cb99b7a9cdc030f52a",
+ "ab70f3729b52287c6432ba7624280a68", "c1e5cf39cbc8030b82e09633c6c67d42",
+ "d5120a196164ff5a0ad7aa8c02e9b064", "1133759f3aee3a362a0ab668f6faf843",
+ "feb0ab7f515665f79fce213e8cd2fb10", "e86ea55c2d6d5cc69716535bd455c99f",
+ "e463da1b9d089b6ee82c041794257fd7", "27800e4af0cceeaf0a95c96275a7befe",
+ "f42e426481db00582b327eb2971bca96", "6127ff289833dde0270000d8240f36b7",
+ "cc5dbaf70e2fef7729a8e2ea9937fbcf", "51850b4e3e2a3919e110376fcb6318d3",
+ "d5ac7ac25eb1b5aee293b2a2ec9de775", "64ecc00b2e24a2f07df833fb50ce09c3",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct CdefTestParam {
CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4,
int columns4x4)
@@ -224,6 +255,7 @@ std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) {
template <int bitdepth, typename Pixel>
class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
CdefFilteringTest() = default;
CdefFilteringTest(const CdefFilteringTest&) = delete;
CdefFilteringTest& operator=(const CdefFilteringTest&) = delete;
@@ -328,19 +360,26 @@ void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
}
for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
- if (bitdepth == 8) {
- test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
- GetDigest8bpp(id + plane),
- reinterpret_cast<uint8_t*>(dest_[plane]),
- sizeof(dest_[plane]), elapsed_time);
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(id + plane);
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- } else {
- test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
- GetDigest10bpp(id + plane),
- reinterpret_cast<uint8_t*>(dest_[plane]),
- sizeof(dest_[plane]), elapsed_time);
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigest10bpp(id + plane);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(id + plane);
+ break;
+#endif
}
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(kCdef, kCdefFilterName, expected_digest,
+ reinterpret_cast<uint8_t*>(dest_[plane]),
+ sizeof(dest_[plane]), elapsed_time);
}
}
@@ -396,6 +435,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CdefFilteringTest12bpp = CdefFilteringTest<12, uint16_t>;
+
+TEST_P(CdefFilteringTest12bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest12bpp, DISABLED_Speed) {
+ TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest12bpp,
+ testing::ValuesIn(cdef_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
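
Note: GetDirectionDigest's row index changes from a boolean (8bpp vs. not) to (bitdepth - 8) / 2, which maps the three supported depths onto rows 0..2 of the enlarged digest table:

    static_assert((8 - 8) / 2 == 0, "");   // 8bpp  -> row 0
    static_assert((10 - 8) / 2 == 1, "");  // 10bpp -> row 1
    static_assert((12 - 8) / 2 == 2, "");  // 12bpp -> row 2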
diff --git a/src/dsp/constants.h b/src/dsp/constants.h
index 7c1b62c..dd0a4e0 100644
--- a/src/dsp/constants.h
+++ b/src/dsp/constants.h
@@ -27,25 +27,7 @@
namespace libgav1 {
enum {
- // Documentation variables.
- kBitdepth8 = 8,
- kBitdepth10 = 10,
- kBitdepth12 = 12,
- // Weights are quadratic from '1' to '1 / block_size', scaled by
- // 2^kSmoothWeightScale.
- kSmoothWeightScale = 8,
kCflLumaBufferStride = 32,
- // InterRound0, Section 7.11.3.2.
- kInterRoundBitsHorizontal = 3, // 8 & 10-bit.
- kInterRoundBitsHorizontal12bpp = 5,
- kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction.
- kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction.
- kInterRoundBitsVertical12bpp = 9,
- // Offset applied to 10bpp and 12bpp predictors to allow storing them in
- // uint16_t. Removed before blending.
- kCompoundOffset = (1 << 14) + (1 << 13),
- kCdefSecondaryTap0 = 2,
- kCdefSecondaryTap1 = 1,
}; // anonymous enum
extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
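
Note: this hunk dissolves most of the shared anonymous enum: the documentation bitdepths, smooth-weight scale, InterRound* constants, and kCompoundOffset leave the global constants header, and the CDEF secondary taps are re-homed next to their only consumer in cdef.h (see the cdef.h hunk above). A hedged sketch of how those two taps are consumed; the real loop is CdefFilter_C in cdef.cc and may differ in detail:

    // Secondary neighbors at ring k (k = 0, 1) are weighted 2 then 1.
    const int kSecondaryTaps[2] = {kCdefSecondaryTap0, kCdefSecondaryTap1};
    // sum += kSecondaryTaps[k] * Constrain(neighbor - pixel,
    //                                      secondary_strength, damping);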
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
index f11b45e..6989da0 100644
--- a/src/dsp/convolve.cc
+++ b/src/dsp/convolve.cc
@@ -864,7 +864,93 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+ dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCopy
+ dsp->convolve[0][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveHorizontal
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveVertical
+ dsp->convolve[0][0][1][0] = ConvolveVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Convolve2D
+ dsp->convolve[0][0][1][1] = Convolve2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundCopy
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundHorizontal
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<12, uint16_t>;
#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundVertical
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompound2D
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<12, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockCopy
+ dsp->convolve[1][0][0][0] = ConvolveCopy_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockHorizontal
+ dsp->convolve[1][0][0][1] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlockVertical
+ dsp->convolve[1][0][1][0] =
+ ConvolveIntraBlockCopy1D_C<12, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveIntraBlock2D
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<12, uint16_t>;
+#endif
+
+ dsp->convolve[1][1][0][0] = nullptr;
+ dsp->convolve[1][1][0][1] = nullptr;
+ dsp->convolve[1][1][1][0] = nullptr;
+ dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp12bpp_ConvolveScale2D
+ dsp->convolve_scale[0] = ConvolveScale2D_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ConvolveCompoundScale2D
+ dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -873,6 +959,9 @@ void ConvolveInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h
index 5bc0bad..8780bfc 100644
--- a/src/dsp/convolve.h
+++ b/src/dsp/convolve.h
@@ -17,6 +17,8 @@
#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+#include <cassert>
+
// Pull in LIBGAV1_DspXXX defines representing the implementation status
// of each function. The resulting value of each can be used by each module to
// determine whether an implementation is needed at compile time.
@@ -43,6 +45,35 @@ namespace dsp {
// thread-safe.
void ConvolveInit_C();
+inline int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
+
} // namespace dsp
} // namespace libgav1
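
Note: GetNumTapsInFilter moves out of convolve.inc (next hunk) into the header as an inline function, so SIMD backends can use it without pulling in the .inc file. A usage sketch, assuming the conventional intermediate-buffer sizing for a separable 2D pass:

    #include "src/dsp/convolve.h"
    // For the 2D convolve, the horizontal pass must produce
    // block_height + taps - 1 rows for the vertical pass to consume.
    const int taps = libgav1::dsp::GetNumTapsInFilter(/*filter_index=*/2);  // 8
    const int intermediate_height = block_height + taps - 1;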
diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc
index e0f755e..2e0b270 100644
--- a/src/dsp/convolve.inc
+++ b/src/dsp/convolve.inc
@@ -12,39 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Constants and utility functions used for convolve implementations.
+// Constants used for convolve implementations.
// This will be included inside an anonymous namespace on files where these are
// necessary.
-int GetNumTapsInFilter(const int filter_index) {
- if (filter_index < 2) {
- // Despite the names these only use 6 taps.
- // kInterpolationFilterEightTap
- // kInterpolationFilterEightTapSmooth
- return 6;
- }
-
- if (filter_index == 2) {
- // kInterpolationFilterEightTapSharp
- return 8;
- }
-
- if (filter_index == 3) {
- // kInterpolationFilterBilinear
- return 2;
- }
-
- assert(filter_index > 3);
- // For small sizes (width/height <= 4) the large filters are replaced with 4
- // tap options.
- // If the original filters were |kInterpolationFilterEightTap| or
- // |kInterpolationFilterEightTapSharp| then it becomes
- // |kInterpolationFilterSwitchable|.
- // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
- // tap filter.
- return 4;
-}
-
constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels;
constexpr int kIntermediateStride = 8;
constexpr int kHorizontalOffset = 3;
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
index 295c814..42cdeb7 100644
--- a/src/dsp/convolve_test.cc
+++ b/src/dsp/convolve_test.cc
@@ -418,6 +418,166 @@ const char* GetConvolveScaleDigest10bpp(int id) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetConvolveDigest12bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = {
+ "e25031afae184cc4d186cde7e3d51e33", "6fb55dec2506dae6c229469cdf2e7d83",
+ "9df34d27f5bd040d1ed1455b151cd1ff", "7f6829458f00edb88f78851dd1a08739",
+ "a8bbe9b6b9eaf6f681d91c981b994949", "21f74980b36cb246426f4bc3fe7c08c3",
+ "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+ "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+ "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+ "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+ "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+ "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+ "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+ "291e67095399651dc5c8a033390f255f", "66b26828e434faf37ddc57d3e0abb6db",
+ "e9cd69e9ba70864e3d0b175ac0a177d6", "64e4db895a843cb05384f5997b1ba978",
+ "f305161c82de999d2c93eac65f609cfe", "4762b2bd27983ad916ec0a930c0eca6b",
+ "1631495ffae43a927267ebd476015ef1", "b0f22de7b10057e07af71f9bce4615ce",
+ "6fa29dc4be1a46d246a41d66a3d35cb4", "734601c2185bdf30ba9ded8b07003a05",
+ "524e4553d92c69e7e4ed934f7b806c6b", "3709c8950bc5fcc4a2b3ec68fc78bf7e",
+ "69c274d9f8e0fd6790495e9695251f1f", "ee30cc1232c27494ef53edd383568f25",
+ "e525dbeb0b4341952a92270dcfc51730", "b3685c9e783d3402497bbd49d28c7dd7",
+ "d1c9f02dc818e6b974794dfb7749aac8", "bdb9e4961f9aa8c25568d3394e968518",
+ "f5f74555adcad85f3ebd3cb85dc7b770", "737e2a0be806dbd701014f2078be7898",
+ "20a18294e3a9422193aa0a219fd80ede", "7106648ecb9ae24a54d1dbabf2a9e318",
+ "20f39cbd6b5ed87a6ae4f818932325c0", "a99666e3157e32a07c87b01e52091a76",
+ "123e4d533d478c3089c975323c85396b", "d2a8021f7683a0cdf2658418fa90a6fc",
+ "1437e192a3349db8702d5b90eb88dbc1", "fe097fc4aeed7cda0b0f405124efb19d",
+ "1988227c51fa589db1307fd890bb5972", "537e25a6c30b240dc1e3bddd1c3a0a03",
+ "aebe5cffaae448db5a08987a3375a428", "7127ae9bdc63df4459590dc02ca95403",
+ "7ad281903a210f2b1f39f7c40c8df272", "d4b97ba21f7b400ba9f9cd8bb1a576df",
+ "0884a824203aaf72c78f73fdaf2b23a2", "63d60388605c92daee55d517de622a9e",
+ "171ec49a779de1efa69510eefbd09bba", "541cf051579c5a10b9debd3bfdcb7f32",
+ "91c14451ad93ed88e96b5d639ce408de", "3b0313ec0e043d19744bf88c90e875a1",
+ "6adcb3cee91fe3a83b36deb11c5ad6dd", "c6d4bfad24616a88222681992a99d782",
+ "515dc0f2a41730d5c434e4f3c81b02c3", "1c69abdee3b9608a6094034badc2bec0",
+ "1785a0f321d7dd90aa8846961737a767", "dd12c5b8c341f2423d0d5db4f285d199",
+ "5741fb69aae1ca8a0fbe4f1478df88ef", "a4390ceb4e4e9f5cf6a47a9b11a97015",
+ "6778eb25df902092b440c3402e7f0f80", "5ad9d6b36f8898bb55e901c1c0c523da",
+ "73969b6c03bb5a7345a8b968b542668e", "f48192947e66d70f116193a4186d0186",
+ "53f60d0e89d7d994ec6d6131fb7e75ae", "c75f6f8813839ae3cf192baa29039265",
+ "9ff0852ebbad56663250f86ac3a3bf9b", "668938580a770ea7ace8bbf7d349e89f",
+ "5b06bb0a15ac465a250d9b209f05289f", "a2128f5c8692fed7e7c1c7af22ce9f72",
+ "f80f1d7a58869ec794258c0f7df14620", "ed1e03a35924c92ed2fc9808dc3f06f3",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "35ef89c35d2e8e46feb856c554c21c9f",
+ "b98ce33a1bf4fab840b7ef261b30dbc4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "b98ce33a1bf4fab840b7ef261b30dbc4", "42263fb359c4fdf1c7cdb4980b3e97f2",
+ "1e7071b7db3144188bdcf5d199fe5355", "1e7071b7db3144188bdcf5d199fe5355",
+ "30d367304a87bd25f0ad2ff8e4b5eb41", "4abe6dbb3198219015838dbedf07297a",
+ "4abe6dbb3198219015838dbedf07297a", "acec349a95b5bba98bb830372fa15e73",
+ "a73ad8661256ce2fdf5110425eb260b2", "a73ad8661256ce2fdf5110425eb260b2",
+ "8ff2f049d3f972867f14775188fe589b", "87f5f9a07aea75c325e6d7ff6c96c7c2",
+ "87f5f9a07aea75c325e6d7ff6c96c7c2", "325fcde7d415d7aa4929a3ea013fb9cc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "05aa29814d5ce35389dbcf20368850da",
+ "fbb89f907a040e70953e3364dbe1feda", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "44ac511baf45032078cc0b45e41dba79", "efb98974adc58d88e122293436bb9184",
+ "7eee18c1a16bcb4e7ef7b27f68ba884f", "b0904c9b118dd9a1f9f034c0ff82d1c1",
+ "54436deb5183dd9669dd4f5feadb3850", "4db1c310b7d9a8bd3e2b5d20fa820e3b",
+ "c40abc6b2d67527f48a287cd7e157428", "48ec3fcf509805f484c8e0948c3469be",
+ "cb7d4a76fa7de52ed2fe889785327b38", "f57983346815fa41e969c195c1c03774",
+ "7dba59b0de2c877666ded6bdaefdcc30", "4837f8ba2f67f17f28a38c5e2a434c73",
+ "09e06fe9dc7ef7818f2a96895235afd4", "002976970ec62b360f956b9c091782d4",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "78673b1084367e97b8dd83990adc5219",
+ "06b5d4a30b9efb6c1d95ef7957f49e76", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "ce460146922cd53970b510d460aa4062", "6fd051938b8efcec9ece042f1edc177a",
+ "f5ff0dcfe3c1a56e3856549d8ded416b", "b69bc2cfc17c6b4313264db96831f0d1",
+ "38a5e65bd71934becfb376eb3b9bc513", "32c1163aa4ca6b6c69d950aba7b06d48",
+ "0c22a6c014c6347983de4ca863f3b53f", "a80c5ee9eb2dfb9a0d56e92eb3f85d91",
+ "a9719722a150a81175427bc161b95d7a", "8237befd456131a488cc5b8b63f4aca5",
+ "51616abcd0beea53a78ffce106b974fc", "6c47b22270f01d27b404da07e1be1202",
+ "356268298d3887edaabd0169a912c94e", "d2b00216e106cb8c5450e2eff1f8481a",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "c2de3a582c79aee811076211c497d2bc",
+ "d1b6d9c73da41def26dd4f85fbd1bde8", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "d8374eb7825081b89f74b05c66bccd63", "d5f7d68c10b5eaf0fba6f93ee26266e6",
+ "94d19cb65f29db65e6656b588f431ade", "5126e95f0249024a6f6d426714bd5b20",
+ "d7d3654b9c2dabe13239875984770acd", "6491afd5d651aab80aa179b579b74341",
+ "037a5de0de89983808f8e8f6dc39110f", "5980073b7685c5c9b2ec027e06be2cbc",
+ "0abb9d035aca426b62ca0f3fab063bab", "fe002a902bb4ec24dfe3ea0fe381a472",
+ "1ac15726df1aa2cd8855162a91893379", "0758c3ac16467605d73c725a697c3dc1",
+ "97d894d85f6ccfa4ff81e0e8fdf03da1", "c3c7b362f063a18244ea542a42d2873c",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7f6829458f00edb88f78851dd1a08739",
+ "a8bbe9b6b9eaf6f681d91c981b994949", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "403c2ccced3b5b141a4c7559c0cd841b", "1c3c4c6cd8a3e79cd95d6038531b47e5",
+ "f18d6950d36619086ac0055bab528cb1", "37d9c5babddf24fe8cb061297297b769",
+ "c111000d4021381f3d18ea0e24a1b5f5", "4e1e4f0a592ff028e35f8187627d6e68",
+ "0ca9ad4614d32da05746f8712a46d861", "8a122ab194e2fdb7089b29be50af8c86",
+ "3c21326e22a80982d1b0ffc09be4beae", "f6c8d1fe2c1fb19604c49c6a49bd26a8",
+ "d3eda9d7aa80e4ea1f18bf565b143e57", "fe21bd1cb8e90466dc727f2223ea7aed",
+ "01efe3df83c715325aaddd4d4ce130ad", "ecaa751360121d3746b72932415fb998",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "67b2ea94cc4d0b32db3ae3c29eee4d46",
+ "bcfec99ad75988fa1efc1733204f17f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "79c222c5796e50119f1921e7bc534a25", "ae3f7c189458f044e9c52399d37a55e2",
+ "fd6dde45bd2937331428de9ef4f8e869", "b384d065423f3d271b85781d76a73218",
+ "466ea0680c06f59e8b3bb293608731fb", "360541ba94f42d115fe687a97a457ffb",
+ "e5a0794d37af40c40a4d2c6d3f7d2aa2", "4eed285651a75614bd60adebbe2e185c",
+ "bbdbf93942282d7b9c4f07591a1764a6", "1288a9ec3e6f79213b6745e6e7568c44",
+ "4ff1310bfd656d69ed5c108a91a9b01a", "3380806b5f67eb3ebce42f8e7c05b256",
+ "09c4bdf0f30aca6812fb55a5ac06b1bd", "722c86ba6bf21f40742ee33b4edc17c4",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "f5303c96d1630f9840eaaba058cd818b",
+ "c20cd45782b2f52c05e4189912047570", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "d6360f96fe15a3ee1e903b0a53dcaaeb", "4b18995cdf2f5d18410d3a732c5932b1",
+ "6f62bf7395c3dfccc1565ba8424f20e8", "c9987ed30491cd28bbc711dd57228247",
+ "8e277ec837cbecf529ae2eb0578fddc1", "c0c132386f23c5f0fba055a12fb79547",
+ "6b5617ab78dd0916690dfa358298b7b3", "394abedca37f60d1a5148a4c975305ed",
+ "bb88881e0e4cf2d88c2d2b38b5833f20", "bef10806be8d58ea8e97870a813b075e",
+ "b4b017d1f792bea69d3b773db7c80c7c", "0660bc63041213a8a4d74724a3bc4291",
+ "5050c8c5388a561691fd414b00c041df", "9ed40c68de6a8008a902d7224f8b620f",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "ec10ce4a674424478a401847f744251d",
+ "bdd897eafc8ef2651a7bba5e523a6ac2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "2745de4a6b29abb85ee513e22ad362c3", "8aaad384b7cd349b4b968e657ec15523",
+ "fb6c0723432bcd2246d51a90f5fb5826", "f8104ed5921ebd48c6eed16150ffe028",
+ "85c2e236b3e32bf731601237cf0594cd", "8bd6eefff9640766cdf64ab082cb1485",
+ "78b5cd9dde6c6a5900f3040c57172091", "aaa980506bd7bb1d75924a8025698d1a",
+ "90050a411d501f7166f6741832b0c342", "d6ec88b2c38e32511f3359e1d5f9d85b",
+ "96506b8b39274c8fe687ea39761997f1", "3322ea83995c2762fb60db993b401658",
+ "151b6e4ce60392639982fca5a73ac3d3", "d52a1038e135bef233674a843f8c7cb6",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetConvolveScaleDigest12bpp(int id) {
+ // Entries containing 'XXXXX...' are skipped. See the test for details.
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+ "aea59b7a638f27acad2b90fd2b8c9fee", "be87ba981a0af25611a7d5f0970be9b3",
+ "7c81f1486cd607376d776bf2c6e81dec", "f683ba2a9b353bea35f26c1ed730f3c5",
+ "11e2d70daff1726093cb4fcae33ce0d6", "567575eac0dea2f379019b2d4bafe444",
+ "216479ed580d6e0d7c1d523015394814", "dcabbe5f5709a4b6634d77cc514e863a",
+ "4e888207fe917faeea2b44383ac16caf", "d617c5608fae3b01c507c7e88040fee3",
+ "eeac5d9b3dc005e76f13dfc7483eae48", "8ff0a82811f77303c4516bb8c761336f",
+ "95a7c315aaa208097b6ab006f1d07654", "da63527ee80e6772435cff8321a29a95",
+ "404457f72e7113d1f3797a39319fd3fe", "43cbccfe2663ec11c157319acfe629a5",
+ "1dc5b8dee4542f3d7fcf6b0fa325dfde", "16d4506674f2fcedfcd1e006eb097141",
+ "4fcf329ddb405cd6bbb0a6fb87e29eb3", "de77a781957653ea1750f79995605cdc",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "436f6fdc008d94a94bc6f516f98f402f",
+ "b436bd9036f08ba7e50cfc536911dbbd", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+ "720a01018856bd83f4d89a9024b14728", "b7e01a3f161007712ce342f59b2c51f2",
+ "922420ebe5dec4f19c259ebdf8a3259a", "979aaba579556207a7bbcc939123c1b2",
+ "89a30898cbaa4d64f9072173e8365864", "0586ff961f2e4228f4e38299fb25ae07",
+ "adb27a03f8b1b50fe2a52b5ca8d4fc28", "4f91cd92aab2e09f4b123251a8d2f219",
+ "620fba0fff163d96a1cd663d1af4a4c5", "bf7a0fa65b1a90ba34c834558fa2c86e",
+ "c21f7d7d16d047a27b871a7bf8476e2d", "a94b17c81f3ce2b47081bd8dd762a2e5",
+ "9078d20f59bc24862af3856acb8c0357", "ee510ce6b3d22de9e4bd7920a26fd69a",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct ConvolveTypeParam {
ConvolveTypeParam(bool is_intra_block_copy, bool is_compound,
bool has_vertical_filter, bool has_horizontal_filter)
@@ -447,6 +607,7 @@ template <int bitdepth, typename Pixel>
class ConvolveTest : public testing::TestWithParam<
std::tuple<ConvolveTypeParam, ConvolveTestParam>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
ConvolveTest() = default;
~ConvolveTest() override = default;
@@ -725,14 +886,24 @@ void ConvolveTest<bitdepth, Pixel>::Test(
if (!use_fixed_values) {
// md5 sums are only calculated for random input.
- const char* ref_digest;
- if (bitdepth == 8) {
- ref_digest = GetConvolveDigest8bpp(GetDigestId());
- } else {
+ const char* ref_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ ref_digest = GetConvolveDigest8bpp(GetDigestId());
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- ref_digest = GetConvolveDigest10bpp(GetDigestId());
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ ref_digest = GetConvolveDigest10bpp(GetDigestId());
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ ref_digest = GetConvolveDigest12bpp(GetDigestId());
+ break;
+#endif
}
+ ASSERT_NE(ref_digest, nullptr);
+
const char* direction;
if (type_param_.has_vertical_filter && type_param_.has_horizontal_filter) {
direction = "2D";
@@ -896,6 +1067,7 @@ class ConvolveScaleTest
: public testing::TestWithParam<
std::tuple<bool /*is_compound*/, ConvolveTestParam>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
ConvolveScaleTest() = default;
~ConvolveScaleTest() override = default;
@@ -1160,14 +1332,23 @@ void ConvolveScaleTest<bitdepth, Pixel>::Test(
if (!use_fixed_values) {
// md5 sums are only calculated for random input.
- const char* ref_digest;
- if (bitdepth == 8) {
- ref_digest = GetConvolveScaleDigest8bpp(GetDigestId());
- } else {
+ const char* ref_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ ref_digest = GetConvolveScaleDigest8bpp(GetDigestId());
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- ref_digest = GetConvolveScaleDigest10bpp(GetDigestId());
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ ref_digest = GetConvolveScaleDigest10bpp(GetDigestId());
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ ref_digest = GetConvolveScaleDigest12bpp(GetDigestId());
+ break;
+#endif
}
+ ASSERT_NE(ref_digest, nullptr);
const auto elapsed_time_us =
static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
@@ -1322,6 +1503,47 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest10bpp,
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConvolveTest12bpp = ConvolveTest<12, uint16_t>;
+
+TEST_P(ConvolveTest12bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest12bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest12bpp = ConvolveScaleTest<12, uint16_t>;
+
+TEST_P(ConvolveScaleTest12bpp, FixedValues) {
+ Test(true, 0);
+ Test(true, 1);
+ Test(true, 128);
+ Test(true, (1 << 12) - 1);
+}
+
+TEST_P(ConvolveScaleTest12bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest12bpp, DISABLED_Speed) {
+ const int num_runs = static_cast<int>(1.0e7 / (param_.width * param_.height));
+ Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12bpp,
+ testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+ testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest12bpp,
+ testing::Combine(testing::Bool(),
+ testing::ValuesIn(kConvolveParam)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
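
Note: the DISABLED_Speed tests above size num_runs as 1.0e7 / (width * height), i.e. roughly a constant ten million filtered pixels per benchmark regardless of block shape:

    const int runs_4x4 = static_cast<int>(1.0e7 / (4 * 4));         // 625000
    const int runs_128x128 = static_cast<int>(1.0e7 / (128 * 128));  // 610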
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
index 34d10fc..ef83235 100644
--- a/src/dsp/distance_weighted_blend.cc
+++ b/src/dsp/distance_weighted_blend.cc
@@ -88,7 +88,22 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DistanceWeightedBlend
+ dsp->distance_weighted_blend = DistanceWeightedBlend_C<12, uint16_t>;
#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -97,6 +112,9 @@ void DistanceWeightedBlendInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc
index fdf058e..88040b4 100644
--- a/src/dsp/distance_weighted_blend_test.cc
+++ b/src/dsp/distance_weighted_blend_test.cc
@@ -47,6 +47,7 @@ template <int bitdepth, typename Pixel>
class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
DistanceWeightedBlendTest() = default;
~DistanceWeightedBlendTest() override = default;
@@ -268,6 +269,56 @@ INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDistanceWeightedBlendDigest12bpp(const BlockSize block_size) {
+ static const char* const kDigests[] = {
+ // 4xN
+ "e30bf8f5f294206ad1dd79bd10a20827",
+ "f0cfb60134562d9c5f2ec6ad106e01ef",
+ "ad0876244e1b769203266a9c75b74afc",
+ // 8xN
+ "5265b954479c15a80f427561c5f36ff4",
+ "7f157457d1671e4ecce7a0884e9e3f76",
+ "d2cef5cf217f2d1f787c8951b7fe7cb2",
+ "6d23059008adbbb84ac941c8b4968f5b",
+ // 16xN
+ "ae521a5656ed3440d1fa950c20d90a79",
+ "935bec0e12b5dd3e0c34b3de8ba51476",
+ "0334bafcdcd7ddddb673ded492bca25a",
+ "c5360f08d0be77c79dc19fb55a0c5fe0",
+ "c2d1e7a4244a8aaaac041aed0cefc148",
+ // 32xN
+ "ce7f3cf78ae4f836cf69763137f7f6a6",
+ "800e52ebb14d5831c047d391cd760f95",
+ "74aa2b412b42165f1967daf3042b4f17",
+ "140d4cc600944b629b1991e88a9fe97c",
+ // 64xN
+ "3d206f93229ee2cea5c5da4e0ac6445a",
+ "3d13028f8fffe79fd35752c0177291ca",
+ "e7a7669acb5979dc7b15a19eed09cd4c",
+ "599368f4971c203fc5fa32989fe8cb44",
+ // 128xN
+ "54b46af2e2c8d2081e26fa0315b4ffd7",
+ "602e769bb2104e78223e68e50e7e86a0",
+ };
+ assert(block_size < kMaxBlockSizes);
+ return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest12bpp = DistanceWeightedBlendTest<12, uint16_t>;
+
+TEST_P(DistanceWeightedBlendTest12bpp, Blending) {
+ Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest12bpp, DISABLED_Speed) {
+ Test(GetDistanceWeightedBlendDigest12bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest12bpp,
+ testing::ValuesIn(kTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
index aac0ca0..97a064f 100644
--- a/src/dsp/dsp.cc
+++ b/src/dsp/dsp.cc
@@ -78,6 +78,12 @@ dsp::Dsp* GetWritableDspTable(int bitdepth) {
return &dsp_10bpp;
}
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12: {
+ static dsp::Dsp dsp_12bpp;
+ return &dsp_12bpp;
+ }
+#endif
}
return nullptr;
}
@@ -157,6 +163,7 @@ void DspInit() {
#if LIBGAV1_MAX_BITDEPTH >= 10
ConvolveInit10bpp_NEON();
InverseTransformInit10bpp_NEON();
+ LoopFilterInit10bpp_NEON();
LoopRestorationInit10bpp_NEON();
#endif // LIBGAV1_MAX_BITDEPTH >= 10
#endif // LIBGAV1_ENABLE_NEON
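
Note: with the new case above, GetWritableDspTable keeps one lazily-constructed static Dsp table per supported bitdepth and returns nullptr otherwise, so callers can probe for 12bpp support at runtime:

    // Returns nullptr when the library was built with LIBGAV1_MAX_BITDEPTH < 12
    // (see the GetDspTable expectations in dsp_test.cc below).
    const libgav1::dsp::Dsp* table = libgav1::dsp::GetDspTable(12);
    if (table == nullptr) {
      // 12bpp streams cannot be decoded by this build.
    }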
diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc
index 5c2a3aa..6d2817b 100644
--- a/src/dsp/dsp_test.cc
+++ b/src/dsp/dsp_test.cc
@@ -41,7 +41,9 @@ constexpr int kMaxTransform1dSize[kNumTransform1ds] = {
};
void CheckTables(bool c_only) {
-#if LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10, kBitdepth12};
+#elif LIBGAV1_MAX_BITDEPTH >= 10
static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10};
#else
static constexpr int kBitdepths[] = {kBitdepth8};
@@ -108,7 +110,9 @@ void CheckTables(bool c_only) {
const uint32_t cpu_features = GetCpuInfo();
super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0;
#endif
- if (c_only) super_res_coefficients_is_nonnull = false;
+ if (c_only || bitdepth == kBitdepth12) {
+ super_res_coefficients_is_nonnull = false;
+ }
if (super_res_coefficients_is_nonnull) {
EXPECT_NE(dsp->super_res_coefficients, nullptr);
} else {
@@ -234,6 +238,9 @@ TEST(Dsp, TablesArePopulatedCOnly) {
#if LIBGAV1_MAX_BITDEPTH >= 10
test_utils::ResetDspTable(kBitdepth10);
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ test_utils::ResetDspTable(kBitdepth12);
+#endif
dsp_internal::DspInit_C();
CheckTables(/*c_only=*/true);
}
@@ -241,15 +248,22 @@ TEST(Dsp, TablesArePopulatedCOnly) {
TEST(Dsp, GetDspTable) {
EXPECT_EQ(GetDspTable(1), nullptr);
- EXPECT_NE(GetDspTable(8), nullptr);
+ EXPECT_NE(GetDspTable(kBitdepth8), nullptr);
EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr);
- EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth8), nullptr);
#if LIBGAV1_MAX_BITDEPTH >= 10
- EXPECT_NE(GetDspTable(10), nullptr);
- EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr);
+ EXPECT_NE(GetDspTable(kBitdepth10), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#else
+ EXPECT_EQ(GetDspTable(kBitdepth10), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth10), nullptr);
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ EXPECT_NE(GetDspTable(kBitdepth12), nullptr);
+ EXPECT_NE(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
#else
- EXPECT_EQ(GetDspTable(10), nullptr);
- EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr);
+ EXPECT_EQ(GetDspTable(kBitdepth12), nullptr);
+ EXPECT_EQ(dsp_internal::GetWritableDspTable(kBitdepth12), nullptr);
#endif
}
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
index fa12b69..906230d 100644
--- a/src/dsp/film_grain.cc
+++ b/src/dsp/film_grain.cc
@@ -19,17 +19,16 @@
#include <cstddef>
#include <cstdint>
#include <cstring>
-#include <new>
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
#include "src/utils/memory.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -45,7 +44,7 @@ void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[],
memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
return;
}
- constexpr int index_shift = bitdepth - kBitdepth8;
+ constexpr int index_shift = (bitdepth == kBitdepth10) ? 2 : 0;
static_assert(sizeof(scaling_lut[0]) == 2, "");
Memset(scaling_lut, point_scaling[0],
std::max(static_cast<int>(point_value[0]), 1) << index_shift);
@@ -866,6 +865,121 @@ void Init10bpp() {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // LumaAutoRegressionFunc
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+
+ // ChromaAutoRegressionFunc
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+
+ // ConstructNoiseStripesFunc
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+
+ // ConstructNoiseImageOverlapFunc
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+
+ // InitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth12>;
+
+ // BlendNoiseWithImageLumaFunc
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+
+ // BlendNoiseWithImageChromaFunc
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionLuma
+ dsp->film_grain.luma_auto_regression[0] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[1] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+ dsp->film_grain.luma_auto_regression[2] =
+ ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainAutoregressionChroma
+ // Chroma autoregression should never be called when lag is 0 and use_luma is
+ // false.
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+ dsp->film_grain.chroma_auto_regression[0][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, false>;
+ dsp->film_grain.chroma_auto_regression[0][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, false>;
+ dsp->film_grain.chroma_auto_regression[0][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, false>;
+ dsp->film_grain.chroma_auto_regression[1][0] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 0, true>;
+ dsp->film_grain.chroma_auto_regression[1][1] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 1, true>;
+ dsp->film_grain.chroma_auto_regression[1][2] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 2, true>;
+ dsp->film_grain.chroma_auto_regression[1][3] =
+ ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth12, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseStripes
+ dsp->film_grain.construct_noise_stripes[0] =
+ ConstructNoiseStripes_C<kBitdepth12, int16_t>;
+ dsp->film_grain.construct_noise_stripes[1] =
+ ConstructNoiseStripesWithOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainConstructNoiseImageOverlap
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap_C<kBitdepth12, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainInitializeScalingLutFunc
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_C<kBitdepth12>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseLuma
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChroma
+ dsp->film_grain.blend_noise_chroma[0] =
+ BlendNoiseWithImageChroma_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_FilmGrainBlendNoiseChromaWithCfl
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_C<kBitdepth12, int16_t, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace film_grain
@@ -874,6 +988,9 @@ void FilmGrainInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
film_grain::Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ film_grain::Init12bpp();
+#endif
}
} // namespace dsp
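
Note: InitializeScalingLookupTable_C's index_shift formerly scaled with bitdepth - kBitdepth8 and now special-cases 10bpp, so the LUT is built at 10-bit index resolution only for kBitdepth10; both 8bpp and the new 12bpp path keep an 8-bit-resolution table, with 12-bit values presumably reduced to 8-bit precision at the lookup site. Implied index ranges, using a hypothetical helper purely for illustration:

    constexpr int IndexShift(int bitdepth) { return (bitdepth == 10) ? 2 : 0; }
    static_assert(IndexShift(8) == 0, "");   // indices 0..255
    static_assert(IndexShift(10) == 2, "");  // indices 0..1023
    static_assert(IndexShift(12) == 0, "");  // indices 0..255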
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
index 2e6ad45..3c8d761 100644
--- a/src/dsp/film_grain_common.h
+++ b/src/dsp/film_grain_common.h
@@ -17,15 +17,7 @@
#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
-#include <cstddef>
#include <cstdint>
-#include <memory>
-#include <type_traits>
-
-#include "src/dsp/common.h"
-#include "src/utils/array_2d.h"
-#include "src/utils/constants.h"
-#include "src/utils/cpu.h"
namespace libgav1 {
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
index fe66db2..9875ef1 100644
--- a/src/dsp/intra_edge.cc
+++ b/src/dsp/intra_edge.cc
@@ -100,7 +100,26 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeFilter
+ dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_IntraEdgeUpsampler
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<12, uint16_t>;
#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -109,6 +128,9 @@ void IntraEdgeInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc
index aca6f9e..b287544 100644
--- a/src/dsp/intra_edge_test.cc
+++ b/src/dsp/intra_edge_test.cc
@@ -76,6 +76,7 @@ constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = {
template <int bitdepth, typename Pixel>
class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraEdgeFilterTest() = default;
IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete;
IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete;
@@ -315,11 +316,27 @@ TEST_P(IntraEdgeFilterTest10bpp, FixedInput) {
}
TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(1e7); }
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeFilterTest12bpp = IntraEdgeFilterTest<12, uint16_t>;
+
+const char* GetIntraEdgeFilterDigest12bpp(int strength, int size) {
+ return GetIntraEdgeFilterDigest10bpp(strength, size);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeFilterDigest12bpp(strength_, size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest12bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
template <int bitdepth, typename Pixel>
class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraEdgeUpsamplerTest() = default;
IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete;
IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete;
@@ -476,7 +493,22 @@ TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) {
}
TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraEdgeUpsamplerTest12bpp = IntraEdgeUpsamplerTest<12, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest12bpp(int size) {
+ return GetIntraEdgeUpsampleDigest10bpp(size);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, FixedInput) {
+ TestFixedValues(GetIntraEdgeUpsampleDigest12bpp(size_));
+ TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest12bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp,
testing::ValuesIn(kIntraEdgeFilterParamList));
@@ -512,7 +544,15 @@ INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp,
testing::ValuesIn(kIntraEdgeUpsampleSizes));
#endif
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest12bpp,
+ testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest12bpp,
+ testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
index 75af279..3162acc 100644
--- a/src/dsp/intrapred.cc
+++ b/src/dsp/intrapred.cc
@@ -1422,6 +1422,551 @@ void Init10bpp() {
} // NOLINT(readability/fn_size)
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = IntraPredBppDefs<12, uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_INTRAPREDICTORS(DefsHbd, Defs12bpp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+ Defs12bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+ DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+ DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+ DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+ DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+ DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+ Defs12bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+ DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+ DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+ DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+ DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+ Defs12bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+ DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+ DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+ DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ DefsHbd::_4x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ DefsHbd::_4x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+ DefsHbd::_4x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
+ Defs12bpp::_8x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+ DefsHbd::_8x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+ DefsHbd::_8x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+ DefsHbd::_8x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ DefsHbd::_8x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+ DefsHbd::_8x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+ DefsHbd::_8x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
+ Defs12bpp::_8x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+ DefsHbd::_8x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+ DefsHbd::_8x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+ DefsHbd::_8x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ DefsHbd::_8x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ DefsHbd::_8x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+ DefsHbd::_8x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
+ Defs12bpp::_8x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+ DefsHbd::_8x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+ DefsHbd::_8x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+ DefsHbd::_8x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ DefsHbd::_8x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+ DefsHbd::_8x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+ DefsHbd::_8x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
+ Defs12bpp::_8x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+ DefsHbd::_8x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+ DefsHbd::_8x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+ DefsHbd::_8x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ DefsHbd::_8x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ DefsHbd::_8x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+ DefsHbd::_8x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
+ Defs12bpp::_16x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+ DefsHbd::_16x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+ DefsHbd::_16x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+ DefsHbd::_16x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ DefsHbd::_16x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+ DefsHbd::_16x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+ DefsHbd::_16x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
+ Defs12bpp::_16x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+ DefsHbd::_16x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+ DefsHbd::_16x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+ DefsHbd::_16x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ DefsHbd::_16x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ DefsHbd::_16x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+ DefsHbd::_16x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
+ Defs12bpp::_16x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+ DefsHbd::_16x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+ DefsHbd::_16x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+ DefsHbd::_16x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ DefsHbd::_16x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+ DefsHbd::_16x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+ DefsHbd::_16x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
+ Defs12bpp::_16x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+ DefsHbd::_16x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+ DefsHbd::_16x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+ DefsHbd::_16x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ DefsHbd::_16x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+ DefsHbd::_16x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+ DefsHbd::_16x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
+ Defs12bpp::_16x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+ DefsHbd::_16x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+ DefsHbd::_16x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+ DefsHbd::_16x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ DefsHbd::_16x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+ DefsHbd::_16x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+ DefsHbd::_16x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
+ Defs12bpp::_32x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+ DefsHbd::_32x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+ DefsHbd::_32x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+ DefsHbd::_32x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ DefsHbd::_32x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+ DefsHbd::_32x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+ DefsHbd::_32x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
+ Defs12bpp::_32x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+ DefsHbd::_32x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+ DefsHbd::_32x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+ DefsHbd::_32x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ DefsHbd::_32x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+ DefsHbd::_32x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+ DefsHbd::_32x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
+ Defs12bpp::_32x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+ DefsHbd::_32x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+ DefsHbd::_32x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+ DefsHbd::_32x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ DefsHbd::_32x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+ DefsHbd::_32x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+ DefsHbd::_32x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
+ Defs12bpp::_32x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+ DefsHbd::_32x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+ DefsHbd::_32x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+ DefsHbd::_32x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ DefsHbd::_32x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ DefsHbd::_32x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+ DefsHbd::_32x64::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
+ Defs12bpp::_64x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+ DefsHbd::_64x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+ DefsHbd::_64x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+ DefsHbd::_64x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ DefsHbd::_64x16::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+ DefsHbd::_64x16::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+ DefsHbd::_64x16::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
+ Defs12bpp::_64x32::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+ DefsHbd::_64x32::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+ DefsHbd::_64x32::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+ DefsHbd::_64x32::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ DefsHbd::_64x32::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+ DefsHbd::_64x32::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+ DefsHbd::_64x32::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcFill
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
+ Defs12bpp::_64x64::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcTop
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+ DefsHbd::_64x64::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDcLeft
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+ DefsHbd::_64x64::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorDc
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+ DefsHbd::_64x64::Dc;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ DefsHbd::_64x64::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+ DefsHbd::_64x64::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorPaeth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+ DefsHbd::_64x64::Paeth;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
#undef INIT_INTRAPREDICTORS_WxH
#undef INIT_INTRAPREDICTORS
} // namespace
@@ -1431,6 +1976,9 @@ void IntraPredInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
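
Note on the Init12bpp() additions above: they repeat libgav1's standard dispatch-table idiom, where a slot in the writable DSP table receives the C fallback only when no optimized implementation has claimed it by defining the matching LIBGAV1_Dsp12bpp_* macro. A minimal sketch of that idiom, with illustrative names (Table, CFallback, InitSketch) that are not libgav1's real identifiers:

#include <cassert>
#include <cstddef>

using PredictorFn = void (*)(void* dest, std::ptrdiff_t stride);

struct Table {
  PredictorFn dc_4x4;
};

void CFallback(void* /*dest*/, std::ptrdiff_t /*stride*/) {}

// An optimized port claims the slot by defining the guard macro in its
// header before this init is compiled; the C init then leaves it alone.
void InitSketch(Table* const table) {
  assert(table != nullptr);
#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorDc
  table->dc_4x4 = CFallback;  // no SIMD override registered for this slot
#endif
}

The same structure explains the branch at the top of the hunk: with LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS set, every slot takes the C version unconditionally via INIT_INTRAPREDICTORS.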
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
index 0f7f4f2..798bb73 100644
--- a/src/dsp/intrapred_cfl.cc
+++ b/src/dsp/intrapred_cfl.cc
@@ -639,6 +639,263 @@ void Init10bpp() {
} // NOLINT(readability/fn_size)
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(12, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 12, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 12, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
#undef INIT_CFL_INTRAPREDICTOR_WxH
#undef INIT_CFL_INTRAPREDICTORS
@@ -649,6 +906,9 @@ void IntraPredCflInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
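
A reading aid for the CflSubsampler_C table above: the two trailing template arguments are the (subsampling_x, subsampling_y) shifts, so 4:4:4 maps to <..., 0, 0>, 4:2:2 to <..., 1, 0>, and 4:2:0 to <..., 1, 1>. A self-checking sketch of what those shifts imply; the averaging-count framing is a gloss on the registration pattern, not the function's actual body:

// Each chroma value is derived from (1 << subsampling_x) * (1 << subsampling_y)
// luma samples, matching the trailing template arguments registered above.
constexpr int LumaSamplesPerChromaPixel(int subsampling_x, int subsampling_y) {
  return (1 << subsampling_x) * (1 << subsampling_y);
}

static_assert(LumaSamplesPerChromaPixel(0, 0) == 1, "4:4:4");
static_assert(LumaSamplesPerChromaPixel(1, 0) == 2, "4:2:2");
static_assert(LumaSamplesPerChromaPixel(1, 1) == 4, "4:2:0");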
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
index 82f1d2f..8415d51 100644
--- a/src/dsp/intrapred_cfl_test.cc
+++ b/src/dsp/intrapred_cfl_test.cc
@@ -49,6 +49,7 @@ template <int bitdepth, typename Pixel>
class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraPredTestBase() {
switch (tx_size_) {
case kNumTransformSizes:
@@ -127,6 +128,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
template <int bitdepth, typename Pixel>
class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
CflIntraPredTest() = default;
CflIntraPredTest(const CflIntraPredTest&) = delete;
CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
@@ -274,6 +276,7 @@ void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
CflSubsamplerTest() = default;
CflSubsamplerTest(const CflSubsamplerTest&) = delete;
CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
@@ -654,8 +657,6 @@ TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
//------------------------------------------------------------------------------
#if LIBGAV1_MAX_BITDEPTH >= 10
-//------------------------------------------------------------------------------
-
using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
@@ -853,9 +854,238 @@ TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using CflIntraPredTest12bpp = CflIntraPredTest<12, uint16_t>;
+
+const char* GetCflIntraPredDigest12bpp(TransformSize tx_size) {
+ static const char* const kDigest4x4 = "1d92a681a58f99396f22acd8b3154e2b";
+ static const char* const kDigest4x8 = "cf6833ebc64c9ae45f192ee384ef4aa3";
+ static const char* const kDigest4x16 = "06a4fbb8590aca98a045c902ed15c777";
+ static const char* const kDigest8x4 = "ad5944c7455f731ae8dd28b2b25a1b9f";
+ static const char* const kDigest8x8 = "c19621e42ca2bc184d5065131d27be2c";
+ static const char* const kDigest8x16 = "8faa7c95e8c3c18621168ed6759c1ac1";
+ static const char* const kDigest8x32 = "502699ef7a8c7aebc8c3bc653e733703";
+ static const char* const kDigest16x4 = "7f30bb038217967336fb8548a6f7df45";
+ static const char* const kDigest16x8 = "b70943098d0fb256c2943e2ebdbe6d34";
+ static const char* const kDigest16x16 = "4c34f5669880ab78d648b16b68ea0c24";
+ static const char* const kDigest16x32 = "5d85daf690020ed235617870a1a179b1";
+ static const char* const kDigest32x8 = "f8eec12e58c469ffb698fc60b13b927c";
+ static const char* const kDigest32x16 = "f272bb7e5d2df333aa63d806c95e6748";
+ static const char* const kDigest32x32 = "c737987c0a5414b03e6014f145dd999c";
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigest4x4;
+ case kTransformSize4x8:
+ return kDigest4x8;
+ case kTransformSize4x16:
+ return kDigest4x16;
+ case kTransformSize8x4:
+ return kDigest8x4;
+ case kTransformSize8x8:
+ return kDigest8x8;
+ case kTransformSize8x16:
+ return kDigest8x16;
+ case kTransformSize8x32:
+ return kDigest8x32;
+ case kTransformSize16x4:
+ return kDigest16x4;
+ case kTransformSize16x8:
+ return kDigest16x8;
+ case kTransformSize16x16:
+ return kDigest16x16;
+ case kTransformSize16x32:
+ return kDigest16x32;
+ case kTransformSize32x8:
+ return kDigest32x8;
+ case kTransformSize32x16:
+ return kDigest32x16;
+ case kTransformSize32x32:
+ return kDigest32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflIntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetCflIntraPredDigest12bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest12bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest12bpp444 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest12bpp422 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest12bpp420 =
+ CflSubsamplerTest<12, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest12bpp(TransformSize tx_size,
+ SubsamplingType subsampling_type) {
+ static const char* const kDigests4x4[3] = {
+ "44af37c60e9ccaacea004b57d5dea4cf",
+ "e29dd1d93f23b23778ed8cd85910d987",
+ "81e5dac2fd4c90f872ab814ed0f76ae5",
+ };
+ static const char* const kDigests4x8[3] = {
+ "bfc04aed9fe41ec07b0462a219652d16",
+ "693dd064636a0aa3be7aa098e867c512",
+ "0636c25d88aacd85d63e56011e7c5d15",
+ };
+ static const char* const kDigests4x16[3] = {
+ "6479ab30377288e75a78068d47c7e194",
+ "7d6f9b8b3eb85e73626118fc9210e622",
+ "1f3d474cd7c86899da90e515b8b7a906",
+ };
+ static const char* const kDigests8x4[3] = {
+ "7da5a2029bcdab159225c475fdff02da",
+ "096bfef24caa0670d2cd7b0bb63a7ba6",
+ "f749310dfc8a6129ed438dbc845470c0",
+ };
+ static const char* const kDigests8x8[3] = {
+ "08494051a7ff50718313a79ec7c51f92",
+ "637efad0630e253f7cce11af1a0af456",
+ "b220faf7dfedef860d59079dcf201757",
+ };
+ static const char* const kDigests8x16[3] = {
+ "19f027af516e88d3b9e613e578deb126",
+ "4f3bb155d70f9ea76d05b2f41b297a0c",
+ "b7504347eeda1e59ba8e36385c219e40",
+ };
+ static const char* const kDigests8x32[3] = {
+ "b8f1ef01c5672c87ee1004bb3cd7b8bc",
+ "b3e3318b050eb1c165d1e320ef622fa7",
+ "67754f7c5ae84dc23bb76ffaa2fa848e",
+ };
+ static const char* const kDigests16x4[3] = {
+ "f687fb4e22d8a1446eeb4915036874f4",
+ "7b5ef3d393a98dfe0ba49a0db2083465",
+ "840bbb6edaa50e9f7d391033a3dda2d9",
+ };
+ static const char* const kDigests16x8[3] = {
+ "dd9aed11d115a028035f0cee5b90d433",
+ "340d5d0784356ea199d3d751f4d6ed5e",
+ "e55f6fb5f34d829727e9dc2068098933",
+ };
+ static const char* const kDigests16x16[3] = {
+ "1df36a20d76a405c6273b88b38693cf9",
+ "2a7590d01df60b4bc6f10bfdb07b7a65",
+ "510ee31a5bd609e8f4542bb817539668",
+ };
+ static const char* const kDigests16x32[3] = {
+ "bdbc13b9fb7c3c50d25fda57f86f5ad9",
+ "7c138c568794b3d0c8aabff2edc07efd",
+ "581bef267c2a66e4c2fb079968440dbe",
+ };
+ static const char* const kDigests32x8[3] = {
+ "26f62743793811475e2afe1414c5fee1",
+ "6e6bf1678a04f2f727f0679564fb3630",
+ "a4c15562c26dbcfa43fe03a2b6e728b5",
+ };
+ static const char* const kDigests32x16[3] = {
+ "791f0713bbf032081da8ec08e58b9cd3",
+ "5dc7a673e92767186ae86996f4a30691",
+ "651f09d1244c817d92d1baa094c86f56",
+ };
+ static const char* const kDigests32x32[3] = {
+ "543a9d76e7238d88ba86218ec47c1f49",
+ "b0f2b29aae4858c1f09c27fc4344fd15",
+ "1d45083875fed14c4e5f149384a3cd2d",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4[subsampling_type];
+ case kTransformSize4x8:
+ return kDigests4x8[subsampling_type];
+ case kTransformSize4x16:
+ return kDigests4x16[subsampling_type];
+ case kTransformSize8x4:
+ return kDigests8x4[subsampling_type];
+ case kTransformSize8x8:
+ return kDigests8x8[subsampling_type];
+ case kTransformSize8x16:
+ return kDigests8x16[subsampling_type];
+ case kTransformSize8x32:
+ return kDigests8x32[subsampling_type];
+ case kTransformSize16x4:
+ return kDigests16x4[subsampling_type];
+ case kTransformSize16x8:
+ return kDigests16x8[subsampling_type];
+ case kTransformSize16x16:
+ return kDigests16x16[subsampling_type];
+ case kTransformSize16x32:
+ return kDigests16x32[subsampling_type];
+ case kTransformSize32x8:
+ return kDigests32x8[subsampling_type];
+ case kTransformSize32x16:
+ return kDigests32x16[subsampling_type];
+ case kTransformSize32x32:
+ return kDigests32x32[subsampling_type];
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(CflSubsamplerTest12bpp444, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, FixedInput) {
+ TestSpeed(GetCflSubsamplerDigest12bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest12bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest12bpp420, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
// Cfl predictors are available only for transform sizes with
// max(width, height) <= 32.
constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
@@ -918,6 +1148,17 @@ INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp444,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp422,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest12bpp420,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
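
The 12bpp tests added above are digest driven: a deterministic input is predicted, the output buffer is hashed, and the hex string is compared against the goldens returned by GetCflIntraPredDigest12bpp() and GetCflSubsamplerDigest12bpp(). A reduced sketch of that flow; the real harness hashes with MD5 through its test utilities, while this stand-in uses FNV-1a purely to keep the example self-contained:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>

// Stand-in 64-bit FNV-1a hash; the workflow, not the hash, is the point.
std::string HashHex(const uint16_t* data, std::size_t count) {
  std::uint64_t h = 1469598103934665603ull;
  for (std::size_t i = 0; i < count; ++i) {
    h = (h ^ data[i]) * 1099511628211ull;
  }
  char buf[17];
  std::snprintf(buf, sizeof(buf), "%016llx",
                static_cast<unsigned long long>(h));
  return buf;
}

// A test passes when the hashed prediction output matches the golden string;
// any single-pixel regression flips the digest.
bool MatchesGolden(const uint16_t* output, std::size_t count,
                   const char* expected_hex) {
  return HashHex(output, count) == expected_hex;
}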
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
index 21a40b5..9146074 100644
--- a/src/dsp/intrapred_directional.cc
+++ b/src/dsp/intrapred_directional.cc
@@ -94,11 +94,19 @@ void DirectionalIntraPredictorZone1_C(
} while (++y < height);
}
+// clang 14.0.0 produces incorrect code with LIBGAV1_RESTRICT.
+// https://github.com/llvm/llvm-project/issues/54427
+#if defined(__clang__) && __clang_major__ == 14
+#define LOCAL_RESTRICT
+#else
+#define LOCAL_RESTRICT LIBGAV1_RESTRICT
+#endif
+
template <typename Pixel>
void DirectionalIntraPredictorZone2_C(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column, const int width,
+ void* LOCAL_RESTRICT const dest, ptrdiff_t stride,
+ const void* LOCAL_RESTRICT const top_row,
+ const void* LOCAL_RESTRICT const left_column, const int width,
const int height, const int xstep, const int ystep,
const bool upsampled_top, const bool upsampled_left) {
const auto* const top = static_cast<const Pixel*>(top_row);
@@ -143,6 +151,8 @@ void DirectionalIntraPredictorZone2_C(
} while (++y < height);
}
+#undef LOCAL_RESTRICT
+
template <typename Pixel>
void DirectionalIntraPredictorZone3_C(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
@@ -236,6 +246,34 @@ void Init10bpp() {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
void IntraPredDirectionalInit_C() {
@@ -243,6 +281,9 @@ void IntraPredDirectionalInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
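
The LOCAL_RESTRICT dance in the Zone2 hunk above is a narrowly scoped miscompilation workaround: restrict qualification is dropped only for the one affected function and only when the compiler is exactly clang 14, per the linked llvm-project issue. The general shape of such a guard, reduced to a standalone example (the copy function is illustrative):

// Version-scoped workaround: weaken the qualifier only for the broken
// compiler release, keep the aliasing optimization everywhere else, and
// #undef afterwards so the macro cannot leak into unrelated code.
#if defined(__clang__) && __clang_major__ == 14
#define LOCAL_RESTRICT
#else
#define LOCAL_RESTRICT __restrict
#endif

void CopyInts(int* LOCAL_RESTRICT dst, const int* LOCAL_RESTRICT src,
              int count) {
  for (int i = 0; i < count; ++i) dst[i] = src[i];
}

#undef LOCAL_RESTRICT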
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
index 9e98242..8d4fa63 100644
--- a/src/dsp/intrapred_directional_test.cc
+++ b/src/dsp/intrapred_directional_test.cc
@@ -60,6 +60,7 @@ template <int bitdepth, typename Pixel>
class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraPredTestBase() {
switch (tx_size_) {
case kNumTransformSizes:
@@ -150,6 +151,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
template <int bitdepth, typename Pixel>
class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
DirectionalIntraPredTest() = default;
DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
@@ -716,7 +718,7 @@ const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
#if LIBGAV1_ENABLE_NEON
- const auto num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+ const auto num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
#else
const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
#endif
@@ -737,8 +739,8 @@ TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
//------------------------------------------------------------------------------
-#if LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH >= 10
using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
@@ -885,7 +887,7 @@ const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) {
#if LIBGAV1_ENABLE_NEON
- const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+ const int num_runs = static_cast<int>(2e5 / (block_width_ * block_height_));
#else
const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
#endif
@@ -904,9 +906,178 @@ TEST_P(DirectionalIntraPredTest10bpp, FixedInput) {
TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); }
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DirectionalIntraPredTest12bpp = DirectionalIntraPredTest<12, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+ "78f3297743f75e928e755b6ffa2d3050",
+ "7315da39861c6e3ef2e47c913e3be349",
+ "5609cb40b575f24d05880df202a60bd3",
+ };
+ static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+ "efb2363d3c25427abe198806c8ba4d6b",
+ "b5aaa41665a10e7e7944fb7fc90fd59a",
+ "5a85610342339ca3109d775fa18dc25c",
+ };
+ static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+ "9045679914980ea1f579d84509397b6e",
+ "f9f50bdc9f81a93095fd9d6998174aa7",
+ "46c1f82e85b8ba5b03bab41a2f561483",
+ };
+ static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+ "a0ae0956b2b667c528b7803d733d49da",
+ "5d9f60ef8904c4faedb6cfc19e54418a",
+ "4ffdcbbbcb23bca8286f1c286b9cb3e8",
+ };
+ static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+ "086116c6b116613b8b47a086726566ea",
+ "141dca7fcae0e4d4b88887a618271ea1",
+ "3575a34278aa0fb1eed934290982f4a7",
+ };
+ static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+ "7922f40216c78a40abaf675667e79493",
+ "55d20588240171df2e24d105ee1563ad",
+ "674b4d8f4dbf514d22e21cc4baeda1d3",
+ };
+ static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+ "32d4d7e256d3b304026ddb5430cf6a09",
+ "72f4be2569f4e067c252d51ff4030de3",
+ "6779a132e1bac0ac43c2373f56553ed8",
+ };
+ static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+ "1be2e0efc1403f9e22cfb8aeb28763d9",
+ "558c8a5418ac91d21a5839c454a9391f",
+ "7693ebef9b86416ebd6e78e98fcafba7",
+ };
+ static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+ "e6217ed1c673ae42e84f8757316b580d",
+ "028aa582c11a9733f0cd693211a067c5",
+ "082de9fc7c4bc80a8ec8522b5a5cb52c",
+ };
+ static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+ "e3b293c09bdc9c5c543ad046a3f0d64f",
+ "2de5803a6ed497c1039c8e6d675c1dd3",
+ "05742f807560f5d5206e54b70097dc4a",
+ };
+ static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+ "57f2ca4ba56be253eff7e6b73df5003d",
+ "ef8bea00437e01fb798a22cda59f0191",
+ "989ff38c96600c2f108d6e6fa381fd13",
+ };
+ static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+ "f5540f4874c02aa2222a3ba75106f841",
+ "17e5d20f798a96c39abc8a81e7aa7bc6",
+ "0fe9ea14c9dcae466b4a36f1c7db6978",
+ };
+ static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+ "aff9429951ab1885c0d9ed29aa1b6a9f",
+ "4b686e2a879bf0b4aadd06b412e0eb48",
+ "39325d71cddc272bfa1dd2dc80d09ffe",
+ };
+ static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+ "b83dffdf8bad2b7c3808925b6138ca1e",
+ "3656b58c7aaf2025979b4a3ed8a2841e",
+ "cfcc0c6ae3fa5e7d45dec581479459f6",
+ };
+ static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+ "3c91b3b9e2df73ffb718e0bf53c5a5c2",
+ "0dbe27603e111158e70d99e181befb83",
+ "edecbffb32ae1e49b66b6e55ad0af6c6",
+ };
+ static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+ "a3290917f755c7ccdc7b77eb3c6c89a7",
+ "42f89db41fbb366ddb78ef79a043f3e3",
+ "7f7bcbe33aa003b166677c68d12490e9",
+ };
+ static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+ "d4f4c6b70a82695f843e9227bd7d9cc8",
+ "550a0bd87936801651d552e229b683e9",
+ "a4c730ad71f566a930c5672e1b2f48f1",
+ };
+ static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+ "2087c9264c4c5fea9a6fe20dcedbe2b9",
+ "d4dd51d9578a3fc2eb75086fba867c22",
+ "6121a67d63e40107e780d0938aeb3d21",
+ };
+ static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+ "09c3818a07bc54467634c2bfce66f58f",
+ "8da453b8d72d73d71ba15a14ddd59db4",
+ "9bc939aa54445722469b120b8a505cb3",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+ const int num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+ const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+ static_cast<Zone>(i), num_runs);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, FixedInput) {
+ for (int i = kZone1; i < kNumZones; ++i) {
+ TestSpeed(GetDirectionalIntraPredDigests12bpp(tx_size_),
+ static_cast<Zone>(i), 1);
+ }
+}
+
+TEST_P(DirectionalIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
constexpr TransformSize kTransformSizes[] = {
kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
@@ -938,9 +1109,13 @@ INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp,
INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp,
testing::ValuesIn(kTransformSizes));
#endif // LIBGAV1_ENABLE_NEON
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
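
On the Speed changes above: num_runs is a fixed pixel budget divided by block area, so each transform size performs roughly the same total work, and the hunks lower the 8/10bpp NEON budget from 2e7 to 2e5 pixels while the new 12bpp test keeps the larger 2e7/4e7 budgets. The arithmetic, spelled out as a runnable check:

#include <cstdio>

// Constant-work scaling used by the DISABLED_Speed tests: the iteration
// count shrinks as the block grows, keeping total pixels near the budget.
int NumRuns(double pixel_budget, int width, int height) {
  return static_cast<int>(pixel_budget / (width * height));
}

int main() {
  std::printf("4x4: %d runs, 64x64: %d runs\n",
              NumRuns(2e7, 4, 4),     // 1250000
              NumRuns(2e7, 64, 64));  // 4882
  return 0;
}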
diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc
index 9a45eff..2d183cf 100644
--- a/src/dsp/intrapred_filter.cc
+++ b/src/dsp/intrapred_filter.cc
@@ -131,6 +131,21 @@ void Init10bpp() {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
void IntraPredFilterInit_C() {
@@ -138,6 +153,9 @@ void IntraPredFilterInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
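
Everything in these hunks is compiled only when LIBGAV1_MAX_BITDEPTH == 12, a build-time setting chosen in the build files; the fallback default below is an assumption made for the sketch, not a value taken from this patch. A compile-time probe for whether 12-bit support is present:

#ifndef LIBGAV1_MAX_BITDEPTH
#define LIBGAV1_MAX_BITDEPTH 10  // assumed default for this sketch only
#endif

#include <cstdio>

int main() {
#if LIBGAV1_MAX_BITDEPTH == 12
  std::puts("12bpp intra predictors compiled in");
#else
  std::puts("build limited to 10bpp or below");
#endif
  return 0;
}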
diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc
index fe1efdc..c8d60a0 100644
--- a/src/dsp/intrapred_filter_test.cc
+++ b/src/dsp/intrapred_filter_test.cc
@@ -52,6 +52,7 @@ template <int bitdepth, typename Pixel>
class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraPredTestBase() {
switch (tx_size_) {
case kNumTransformSizes:
@@ -130,6 +131,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
template <int bitdepth, typename Pixel>
class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
FilterIntraPredTest() = default;
FilterIntraPredTest(const FilterIntraPredTest&) = delete;
FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete;
@@ -519,6 +521,132 @@ TEST_P(FilterIntraPredTest10bpp, FixedInput) {
TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilterIntraPredTest12bpp = FilterIntraPredTest<12, uint16_t>;
+
+const char* const* GetFilterIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumFilterIntraPredictors] = {
+ "27682e2763f742e0c7156a263af54fe1", "f6fe9b73d8a2024b3125d25a42028be3",
+ "8a232b8caa41f8c4f0b547f0aa072fd7", "411b24dc872e91de3a607f18b51c4e34",
+ "9a106b70ca2df5317afc90aba0316a98",
+ };
+ static const char* const kDigests4x8[kNumFilterIntraPredictors] = {
+ "a0d3f3a8f498727af0844a6df90da971", "bb02998e3d5d7b4643db616a5ce75c51",
+ "eaa39425427c155dea1836c37fc14f7e", "747cc4fa0c9e3418f4a15ded9f846599",
+ "c1a2aeaa01dd3edac4c26f74e01d8d57",
+ };
+ static const char* const kDigests4x16[kNumFilterIntraPredictors] = {
+ "80c01fdef14e3db28987e323801c998e", "de5a2f59384a096324eebe843d4b8ba5",
+ "f85e18efc9297793392607cdd84d8bc4", "d84bf2d9d4996c2f7fd82b6bbd52577b",
+ "9d73771de09c17bd494f1f5f75ab1111",
+ };
+ static const char* const kDigests8x4[kNumFilterIntraPredictors] = {
+ "7df2b038c4d816eb4949de6b933f0632", "0f1c45dd6e8d5534de0c9a279087ea8b",
+ "1b79f3b10facd9ffc404cbafdd73aa43", "e19adec4f14d72c5157f9faf7fc9b23e",
+ "a30ed988ea6ed797d4bf0945ffe7e330",
+ };
+ static const char* const kDigests8x8[kNumFilterIntraPredictors] = {
+ "097a0c14d89ece69e779fa755a2b75c0", "ebadfc559b20246dcd8d74413ff4d088",
+ "097c91bedc1e703b3eb54361d94df59a", "765bbad37b91e644292beac5f06811be",
+ "f3c809461fa3325f0d33087ca79c47d0",
+ };
+ static const char* const kDigests8x16[kNumFilterIntraPredictors] = {
+ "36464af48b38005b61f7f528a0b0c8ba", "47fa0868224c71d28d3cdcf247282c13",
+ "ca34bb57a37ee3e5428814ec63f52117", "420bdca6b643f4421d465345cc264167",
+ "339c124c07a611a65952dc9996ba6e12",
+ };
+ static const char* const kDigests8x32[kNumFilterIntraPredictors] = {
+ "99ca0d3b3fbdd4661a2c07bdb2752a70", "6fedae1dbfe721210b65e08dc77847dd",
+ "956810089f81dc9334103111afec2fbb", "ede4f0bee06def6d8a2037939415d845",
+ "ca146dfe0edbdac3066a0ca387fb6277",
+ };
+ static const char* const kDigests16x4[kNumFilterIntraPredictors] = {
+ "b0f7d5dbf7f9aa3f0ab13273de80dc9d", "a3537f2b60426e9f83aeef973161fcfd",
+ "d4f868f793ab232bee17b49afcfc28a0", "fc43429761d10723b5f377eb6513e59a",
+ "f59aabb06574ce24e1d1113753edb098",
+ };
+ static const char* const kDigests16x8[kNumFilterIntraPredictors] = {
+ "0b539f1e2ecf0300bf3838ab1d80952c", "44f01a4324cda8d27ea44a8bd3620526",
+ "a57819a22b422e7da9d85f09504a2c57", "dbff6a417a8f3606575acb3c98efe091",
+ "534e8e8cd4b73cb4f6ec22f903727efa",
+ };
+ static const char* const kDigests16x16[kNumFilterIntraPredictors] = {
+ "247192bd6a5c2821b8694e4669361103", "1935044a6220ac6315a58b402465b6da",
+ "bdce29a3e988b804d429da1446a34c2a", "4697132c20395fabac2662cb8b1ce35a",
+ "3d07a7beaff6925175fcd9a8e69542e6",
+ };
+ static const char* const kDigests16x32[kNumFilterIntraPredictors] = {
+ "3429b83b7ba723bdd2e3e368979b51b0", "cd099d0eb7f4a20547f91d9402e3394a",
+ "a6a7cc4e0f8ed34424264107b3657fb8", "0125ace62bec7c7ff7240bf5b6f689c5",
+ "a0722dba921b078a6d569ecb81777bf8",
+ };
+ static const char* const kDigests32x8[kNumFilterIntraPredictors] = {
+ "44b1b086ee37a93406e5db95dca825d7", "fdeed5c4644dc288f6dcc148e8d2867a",
+ "b241d112f6fa7a24c44706fb76e49132", "a782dcf01a16231276dbd20121bad640",
+ "4da9c0efd0bcb31f911af52779317fb9",
+ };
+ static const char* const kDigests32x16[kNumFilterIntraPredictors] = {
+ "bf9704995a0a868c45280cac3415c0a7", "373626072ade7c8d709ab732149fd3ae",
+ "9e4a2062aa86ac8dc5164002c953c7ca", "62eede30996d0e55afcf513fe9ad3c58",
+ "a5f3bb32688d5189341304d12e4e6449",
+ };
+ static const char* const kDigests32x32[kNumFilterIntraPredictors] = {
+ "bd93c4ddbe0f06e3f12be25ce490f68c", "bfe772b203b83c982f35a8ed0682cd16",
+ "d357ae05ce215f4c5af650ae82909081", "bd640d3c511edaac1753b64c81afb75d",
+ "4d05d67e02a7c4af7ae981b0eb8a4d7b",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(FilterIntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.5e8 / (block_width_ * block_height_));
+ TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(FilterIntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetFilterIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(FilterIntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
// Filter-intra and Cfl predictors are available only for transform sizes
// with max(width, height) <= 32.
constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
@@ -549,6 +677,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest10bpp,
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
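
The 12bpp tests reuse the library's digest convention: the predictor runs over deterministic input and the output buffer is reduced to an MD5 hex string that must match the per-size golden value. A hedged sketch of that golden-digest pattern; the hash below is a stand-in (FNV-1a) for the MD5 helper in test_utils, and the names are illustrative:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Any stable digest of the output works for illustration; libgav1 uses MD5.
    std::string DigestOf(const std::vector<uint16_t>& buf) {
      uint64_t h = 1469598103934665603ULL;  // FNV-1a offset basis
      for (const uint16_t v : buf) {
        h = (h ^ (v & 0xff)) * 1099511628211ULL;
        h = (h ^ (v >> 8)) * 1099511628211ULL;
      }
      char out[17];
      std::snprintf(out, sizeof(out), "%016llx",
                    static_cast<unsigned long long>(h));
      return out;
    }

    // Fixed-input regression check: deterministic input -> one golden string.
    bool FixedInputMatches(void (*predict)(std::vector<uint16_t>*),
                           const std::string& golden) {
      std::vector<uint16_t> block(32 * 32, 0);  // deterministic starting state
      predict(&block);
      return DigestOf(block) == golden;
    }
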
diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc
index 0c7f272..16b8274 100644
--- a/src/dsp/intrapred_smooth.cc
+++ b/src/dsp/intrapred_smooth.cc
@@ -714,6 +714,266 @@ void Init10bpp() {
} // NOLINT(readability/fn_size)
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(DefsHbd);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
#undef INIT_SMOOTH_WxH
#undef INIT_SMOOTH
} // namespace
@@ -723,6 +983,9 @@ void IntraPredSmoothInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h
index 6802003..06454af 100644
--- a/src/dsp/intrapred_smooth.h
+++ b/src/dsp/intrapred_smooth.h
@@ -38,6 +38,12 @@
namespace libgav1 {
namespace dsp {
+enum {
+ // Weights are quadratic from '1' to '1 / block_size', scaled by
+ // 2^kSmoothWeightScale.
+ kSmoothWeightScale = 8,
+};
+
// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
// This function is not thread-safe.
void IntraPredSmoothInit_C();
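
kSmoothWeightScale documents the fixed-point format of the smooth-prediction weight tables: weights fall quadratically from 1 toward 1/block_size and are stored scaled by 2^8. A minimal sketch of how a vertical smooth prediction consumes one such weight; the function name is illustrative and the real weight table lives elsewhere in the dsp sources:

    #include <cstdint>

    constexpr int kSmoothWeightScale = 8;

    // One pixel of SMOOTH_V: blend the above-row sample against the
    // bottom-left sample with a table weight in [0, 255] (Q8 fixed point).
    inline uint16_t SmoothVerticalPixel(uint16_t top, uint16_t bottom_left,
                                        uint8_t weight) {
      const uint32_t scale = uint32_t{1} << kSmoothWeightScale;  // 256
      const uint32_t pred = weight * top + (scale - weight) * bottom_left;
      // Round to nearest and drop the Q8 scale.
      return static_cast<uint16_t>((pred + (scale >> 1)) >> kSmoothWeightScale);
    }
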
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
index 335aa2f..cca1c73 100644
--- a/src/dsp/intrapred_test.cc
+++ b/src/dsp/intrapred_test.cc
@@ -47,6 +47,7 @@ template <int bitdepth, typename Pixel>
class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraPredTestBase() {
switch (tx_size_) {
case kNumTransformSizes:
@@ -125,6 +126,7 @@ class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
template <int bitdepth, typename Pixel>
class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
IntraPredTest() = default;
IntraPredTest(const IntraPredTest&) = delete;
IntraPredTest& operator=(const IntraPredTest&) = delete;
@@ -666,6 +668,203 @@ TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); }
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using IntraPredTest12bpp = IntraPredTest<12, uint16_t>;
+
+const char* const* GetIntraPredDigests12bpp(TransformSize tx_size) {
+ static const char* const kDigests4x4[kNumIntraPredictors] = {
+ "f7008e0f65bdeed97375ae5e98e3309b", "a34cc5d9d1ef875df4ee2ce010d0a80a",
+ "74f615beeb217ad317ced813851be36a", "b3312e86313805b061c66a08e09de653",
+ "2db47240c95530b39084bdacccf4bb8e", "76bb839cac394b5777c64b6d4b570a27",
+ "a74ee60527be86059e822f7463f49ad5", "b157a40aaa14391c237471ba6d148a50",
+ "d4f7bd2e97e2b23f7a6a059837a10b2a", "8a9bcb30e9aff59b6feef5d1bf546d28",
+ };
+ static const char* const kDigests4x8[kNumIntraPredictors] = {
+ "4c2a59e1d4a58c129c709f05d1a83f4a", "5fbedd99a90a20727195dfbe8f9969ad",
+ "d4645e21ccf5f6d3c4ca7a3d9b0156ba", "98aa17ea5423192c81a04afd2d2669ed",
+ "67dad5b5eefdeb2af1e4d3875b282c6c", "881dcafd6323509fb80cd5bbdf2870c4",
+ "03ece373dfd56bd2fd86ad00ad6f5000", "41b28f2578d2ed7f38e708d57b231948",
+ "9f935505190f52ff4da9556e43f607be", "815700d2abb055bce6902d130e77416d",
+ };
+ static const char* const kDigests4x16[kNumIntraPredictors] = {
+ "bfc47cd4eef143a6ebf517730756a718", "ef07a3af3e353f9dfaebc48c8ac92c82",
+ "ceec5d9d24254efd3c6a00cbf11dd24d", "4e07f512a69cf95608c3c0c3013ed808",
+ "cedb7c900bb6839026bf79d054edb4fc", "48d958a18a019809f12eb2ad2eb358bc",
+ "8f296f4b9fb621a910368609cc2cccdf", "073a6f2ca8a23d6131ff97e2a3b736e1",
+ "f4772cc60b68c4f958c08c0fd8eb8d48", "2f8946cf19abecf0fda3addbfb8f9dcf",
+ };
+ static const char* const kDigests8x4[kNumIntraPredictors] = {
+ "4f245b07a91e6d604da9f22cf277d6f1", "a6dc25d1e24ba9e842c312f67eea211d",
+ "0475204441f44ea95bfd69c6e04eaed8", "313bcf1e2fc762d31ff765d3c18a6f67",
+ "7e9223ece684a1885c2108741052c6c8", "79f1e6f070d9b1d0f1de2ff77bccc0dc",
+ "63adca1101ee4799b1cfa26d88aa0657", "e8b940a5e39ea5313930c903464de843",
+ "42a8e470d3b000f4f57c44c632f0051b", "e8a57663f73da3d4320f8e82a3fecfc2",
+ };
+ static const char* const kDigests8x8[kNumIntraPredictors] = {
+ "7fa3c8bdd9ce04dc4df27863499cf4d4", "83f1312edd9af928a1cef60613730bc3",
+ "ceb35042adc6095a545b490f20e5d81b", "73aa503f329a055ff59a24093e682c41",
+ "14a9a427525ec38d2eb13e698728e911", "9143ddf66234e74acc156565d684fcac",
+ "05182bbe4fd90f3b496033ee5b7c54f9", "d9c6184c23af1f5a903a4a00539b883a",
+ "c4c2d4000ca2defc7a8169215121d9fc", "0b938bc7782b32796bffece28d17bb69",
+ };
+ static const char* const kDigests8x16[kNumIntraPredictors] = {
+ "50197f063138616c37ef09f8bf8a3016", "ef2008f6d9f2176feb17b7d4312022e2",
+ "0d243ffbba0a2e65738d7ee768620c36", "51b52564a2733c2c56ba319db5d8e3b8",
+ "0e2b41482ac1347c3bb6d0e394fe7bec", "edb43c19850452e6b20dfb2c001adb0b",
+ "6cd29f537b5e4180f5aaefd9140b65ef", "6808f618bdff33e0f3d6db60ea487bc1",
+ "0303c17746192b0c52b4d75ea97ca24d", "225d1debd7828fa01bc9a610a443cda9",
+ };
+ static const char* const kDigests8x32[kNumIntraPredictors] = {
+ "dc047c402c6ac4014d621fbd41b460d5", "49eb33c3a112f059e02d6d4b99da8b41",
+ "c906c9105a406ae6c63e69f57ed2fc7c", "2ead452591ddd2455660f96ce79314ab",
+ "437a2a78562752ee8291227f88e0323a", "51834dbdcf1e89667ffbb931bec9006c",
+ "959c1778e11a7c61a5a97176c79ecb6a", "2e51e44dd1953fc6fccc3b1c1ca602ed",
+ "7f94114cddb0ba780cc0c8d00db3f8d2", "b5b3770e6061249a3206915a3f9464e7",
+ };
+ static const char* const kDigests16x4[kNumIntraPredictors] = {
+ "9deb173fa962d9adde8a9ae256708c32", "264624b41e43cfe9378ee9b4fb5028a6",
+ "404919a41bdc7f1a1f9d089223373bb8", "5294ed9fcc16eaf5f9a1f66a2a36ae7c",
+ "a2ed1fa4262bca265dcc62eb1586f0ac", "58494af62f86464dbe471130b2bc4ab0",
+ "fe1f25f7096fc3426cc7964326cc46ad", "cf7f6c8f7257436b9934cecf3b7523e1",
+ "6325036f243abfcd7777754e6a7bdacc", "9dce11a98e18422b04dd9d7be7d420da",
+ };
+ static const char* const kDigests16x8[kNumIntraPredictors] = {
+ "92d5b7d4033dcd8cb729bf8e166e339a", "6cbd9f198828fd3422c9bfaf8c2f1c1d",
+ "2b204014b6dc477f67b36818bcdab1ca", "2ce0b9cf224d4654168c559d7c1424c2",
+ "ec70341b9dd57b379f5283820c9461c7", "3fe1e2a20e44171c90ebca5a45b83460",
+ "0305852b25351ff472a45f45ec1638fa", "565c78271fbe3b25b0eee542095be005",
+ "8bc15e98659cef6236bcb072541bb2ca", "875c87bf4daba7cb436ea2fdb5a427dd",
+ };
+ static const char* const kDigests16x16[kNumIntraPredictors] = {
+ "c9d12bce78d8846f081345906e1315f4", "0b57c8fde6dec15458b1c289245100cb",
+ "1c11978c4e6bbc77767395c63d2f70a8", "e749f26b26b46d8cb7cb13c1c777db94",
+ "40459af05e865e94ff7adcdec1685c15", "f3ae419e99a60dbde3afa24ba6588a36",
+ "fe3912418bca24cee3132de2c193d1fc", "cdc8e3ce27a12f1cbfe01d1adf2eb6bd",
+ "ce354b30ce15a6918172dea55a292b93", "e762d01726d641194982a5fb8c148eb7",
+ };
+ static const char* const kDigests16x32[kNumIntraPredictors] = {
+ "ad8f118b07e053df3887215449633a07", "e8979aa743aef82937d93d87fc9fdb85",
+ "a8afb62cbf602cfcd4b570832afe1d55", "404183cf003764a4f032f0f4810cd42c",
+ "4afcf1bc5589a13b11679571aa953b86", "202df8f5a2d7eb3816de172608115f2b",
+ "ce42bca92d6d7f9df85dbaac72e35064", "61c463c8070b78ca2bdf578044fec440",
+ "3abf6e4d779208e15e3f9a0dfc0254f9", "13df5504084105af7c66a1b013fe44e1",
+ };
+ static const char* const kDigests16x64[kNumIntraPredictors] = {
+ "3ac1f642019493dec1b737d7a3a1b4e5", "cbf69d5d157c9f3355a4757b1d6e3414",
+ "96d00ddc7537bf7f196006591b733b4e", "8cba1b70a0bde29e8ef235cedc5faa7d",
+ "35f9ee300d7fa3c97338e81a6f21dcd4", "aae335442e77c8ebc280f16ea50ba9c7",
+ "a6140fdac2278644328be094d88731db", "2df93621b6ff100f7008432d509f4161",
+ "c77bf5aee39e7ed4a3dd715f816f452a", "02109bd63557d90225c32a8f1338258e",
+ };
+ static const char* const kDigests32x8[kNumIntraPredictors] = {
+ "155688dec409ff50f2333c14a6367247", "cf935e78abafa6ff7258c5af229f55b6",
+ "b4bf83a28ba319c597151a041ff838c3", "fe97f3e6cd5fe6c5979670c11d940dda",
+ "b898c9a989e1e72461a6f47e913d5383", "bb73baa6476ce90118e83e2fd08f2299",
+ "c93be6d8ec318bd805899466821bb779", "ab366991ef842e9d417d52241f6966e6",
+ "9e7e4c96a271e9e40771eac39c21f661", "9459f2e6d1291b8b8a2fe0635ce1a33d",
+ };
+ static const char* const kDigests32x16[kNumIntraPredictors] = {
+ "48374c1241409e26d81e5106c73da420", "97c918bdba2ece52156dbc776b9b70d4",
+ "a44ce9c03f6622a3e93bfe3b928eb6f1", "2384ad95e3e7302f20857121e187aa48",
+ "47e72c6dc0087b6fd99e91cff854c269", "142dc3cbb05b82a496780f7fc3d66ccc",
+ "4a39fb768efcd4f30d6eae816e6a68c4", "d0c31f9d52d984a0335557eafe2b47fa",
+ "81b3af5c7893729b837e4d304917f7cd", "941cbcd411887dc7fa3a5c7395690d1a",
+ };
+ static const char* const kDigests32x32[kNumIntraPredictors] = {
+ "00892ee43a1bbb11347c1f44fb94b1a2", "d66397ba868e62cec99daf5ea73bebd0",
+ "65fe746e79ac1e779caae8abcc15eb6b", "8e308fe96b9845112d79c54f9d7981a0",
+ "47bc8847a7c9aed3417cd5250ba57875", "1a4008b7f0f61a3c73a2ee1d1452b414",
+ "24d25ef488bb457a5a4c4892e47a363d", "6d9d964f5317ab32a8edf57c23775238",
+ "544fc36c1a35c588359ae492cb5bc143", "ac170d94dbd944e9723de9c18bace1a3",
+ };
+ static const char* const kDigests32x64[kNumIntraPredictors] = {
+ "7d0bd7dea26226741dbca9a97f27fa74", "a8bdc852ef704dd4975c61893e8fbc3f",
+ "f29d6d03c143ddf96fef04c19f2c8333", "ad9cfc395a5c5644a21d958c7274ac14",
+ "45c27c5cca9a91b6ae8379feb0881c9f", "8a0b78df1e001b85c874d686eac4aa1b",
+ "ce9fa75fac54a3f6c0cc3f2083b938f1", "c0dca10d88762c954af18dc9e3791a39",
+ "61df229eddfccab913b8fda4bb02f9ac", "4f4df6bc8d50a5600b573f0e44d70e66",
+ };
+ static const char* const kDigests64x16[kNumIntraPredictors] = {
+ "e99d072de858094c98b01bd4a6772634", "525da4b187acd81b1ff1116b60461141",
+ "1348f249690d9eefe09d9ad7ead2c801", "a5e2f9fb685d5f4a048e9a96affd25a4",
+ "873bfa9dc24693f19721f7c8d527f7d3", "0acfc6507bd3468e9679efc127d6e4b9",
+ "57d03f8d079c7264854e22ac1157cfae", "6c2c4036f70c7d957a9399b5436c0774",
+ "42b8e4a97b7f8416c72a5148c031c0b1", "a38a2c5f79993dfae8530e9e25800893",
+ };
+ static const char* const kDigests64x32[kNumIntraPredictors] = {
+ "68bd283cfd1a125f6b2ee47cee874d36", "b4581311a0a73d95dfac7f8f44591032",
+ "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", "db9d82921fd88b24fdff6f849f2f9c87",
+ "804179f05c032908a5e36077bb87c994", "fc5fd041a8ee779015394d0c066ee43c",
+ "68f5579ccadfe9a1baafb158334a3db2", "fe237e45e215ab06d79046da9ad71e84",
+ "9a8a938a6824551bf7d21b8fd1d70ea1", "eb7332f2017cd96882c76e7136aeaf53",
+ };
+ static const char* const kDigests64x64[kNumIntraPredictors] = {
+ "d9a906c0e692b22e1b4414e71a704b7e", "12ac11889ae5f55b7781454efd706a6a",
+ "3f1ef5f473a49eba743f17a3324adf9d", "a6baa0d4bfb2269a94c7a38f86a4bccf",
+ "47d4cadd56f70c11ff8f3e5d8df81161", "de997744cf24c16c5ac2a36b02b351cc",
+ "23781211ae178ddeb6c4bb97a6bd7d83", "a79d2e28340ca34b9e37daabbf030f63",
+ "0372bd3ddfc258750a6ac106b70587f4", "228ef625d9460cbf6fa253a16a730976",
+ };
+
+ switch (tx_size) {
+ case kTransformSize4x4:
+ return kDigests4x4;
+ case kTransformSize4x8:
+ return kDigests4x8;
+ case kTransformSize4x16:
+ return kDigests4x16;
+ case kTransformSize8x4:
+ return kDigests8x4;
+ case kTransformSize8x8:
+ return kDigests8x8;
+ case kTransformSize8x16:
+ return kDigests8x16;
+ case kTransformSize8x32:
+ return kDigests8x32;
+ case kTransformSize16x4:
+ return kDigests16x4;
+ case kTransformSize16x8:
+ return kDigests16x8;
+ case kTransformSize16x16:
+ return kDigests16x16;
+ case kTransformSize16x32:
+ return kDigests16x32;
+ case kTransformSize16x64:
+ return kDigests16x64;
+ case kTransformSize32x8:
+ return kDigests32x8;
+ case kTransformSize32x16:
+ return kDigests32x16;
+ case kTransformSize32x32:
+ return kDigests32x32;
+ case kTransformSize32x64:
+ return kDigests32x64;
+ case kTransformSize64x16:
+ return kDigests64x16;
+ case kTransformSize64x32:
+ return kDigests64x32;
+ case kTransformSize64x64:
+ return kDigests64x64;
+ default:
+ ADD_FAILURE() << "Unknown transform size: " << tx_size;
+ return nullptr;
+ }
+}
+
+TEST_P(IntraPredTest12bpp, DISABLED_Speed) {
+ const auto num_runs =
+ static_cast<int>(2.0e9 / (block_width_ * block_height_));
+ TestSpeed(GetIntraPredDigests12bpp(tx_size_), num_runs);
+}
+
+TEST_P(IntraPredTest12bpp, FixedInput) {
+ TestSpeed(GetIntraPredDigests12bpp(tx_size_), 1);
+}
+
+TEST_P(IntraPredTest12bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(IntraPredTest12bpp, Random) { TestRandomValues(); }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
constexpr TransformSize kTransformSizes[] = {
kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
@@ -700,6 +899,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp,
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest12bpp,
+ testing::ValuesIn(kTransformSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
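
The DISABLED_Speed tests size their iteration counts so every transform shape does roughly the same total work: num_runs = 2.0e9 / (width * height) holds the pixel budget constant across sizes (the filter-intra test uses 2.5e8 the same way). A trivial illustration:

    #include <cstdio>

    // Constant total pixel budget: smaller blocks get proportionally more runs.
    int NumRuns(double pixel_budget, int width, int height) {
      return static_cast<int>(pixel_budget / (width * height));
    }

    int main() {
      std::printf("4x4:   %d runs\n", NumRuns(2.0e9, 4, 4));    // 125000000
      std::printf("64x64: %d runs\n", NumRuns(2.0e9, 64, 64));  // 488281
      return 0;
    }
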
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
index 1b0064f..0bbdffa 100644
--- a/src/dsp/inverse_transform.cc
+++ b/src/dsp/inverse_transform.cc
@@ -18,6 +18,7 @@
#include <cassert>
#include <cstdint>
#include <cstring>
+#include <type_traits>
#include "src/dsp/dsp.h"
#include "src/utils/array_2d.h"
@@ -25,6 +26,15 @@
#include "src/utils/compiler_attributes.h"
#include "src/utils/logging.h"
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+ LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#include <cinttypes>
+#endif
+
namespace libgav1 {
namespace dsp {
namespace {
@@ -34,24 +44,25 @@ namespace {
constexpr uint8_t kTransformColumnShift = 4;
-#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
-#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
-#endif
-
-int32_t RangeCheckValue(int32_t value, int8_t range) {
+template <typename T>
+int32_t RangeCheckValue(T value, int8_t range) {
#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_assert(
+      std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value,
+      "");
assert(range <= 32);
const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1)));
const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1);
if (min > value || value > max) {
- LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n",
- value, range);
+ LIBGAV1_DLOG(ERROR,
+ "coeff out of bit range, value: %" PRId64 " bit range %d",
+ static_cast<int64_t>(value), range);
assert(min <= value && value <= max);
}
#endif // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
static_cast<void>(range);
- return value;
+ return static_cast<int32_t>(value);
}
template <typename Residual>
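
RangeCheckValue is templated so the same bounds check covers both int32_t and the widened int64_t intermediates, with the debug log widened to PRId64 to match. The motivation is visible at 12bpp, where a coefficient times a Q12 constant no longer fits in 32 bits; a small demonstration (the coefficient magnitude and constant choice are illustrative):

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // At 12bpp a transform coefficient can be on the order of 2^20.
      // Multiplying by a Q12 constant such as 3803 (one of the
      // kAdst4Multiplier entries) exceeds INT32_MAX, so the product must be
      // formed in 64 bits before any range check or shift.
      const int32_t coeff = 1 << 20;
      const int32_t multiplier = 3803;
      const int64_t product = static_cast<int64_t>(coeff) * multiplier;
      std::printf("product = %" PRId64 " (INT32_MAX = %d)\n", product,
                  INT32_MAX);
      return 0;
    }
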
@@ -433,7 +444,13 @@ void Adst4_C(void* dest, int8_t range) {
// Section 7.13.2.6: It is a requirement of bitstream conformance that all
// values stored in the s and x arrays by this process are representable by
// a signed integer using range + 12 bits of precision.
- int32_t s[7];
+ // Note the intermediate value can only exceed INT32_MAX with invalid 12-bit
+ // content. For simplicity in unoptimized code, int64_t is used for both 10 &
+  // 12-bit. SIMD implementations can allow these to roll over on platforms
+ // where this has defined behavior.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
+ Intermediate s[7];
s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
@@ -454,19 +471,23 @@ void Adst4_C(void* dest, int8_t range) {
s[0] = RangeCheckValue(s[0] + s[3], range + 12);
s[1] = RangeCheckValue(s[1] - s[4], range + 12);
s[3] = s[2];
- s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12);
+  // With range checking enabled, b7 would be trapped above. This prevents an
+  // integer sanitizer warning. In SIMD implementations the multiply can be
+  // allowed to roll over on platforms where this has defined behavior.
+ const auto adst2_b7 = static_cast<Intermediate>(kAdst4Multiplier[2]) * b7;
+ s[2] = RangeCheckValue(adst2_b7, range + 12);
// stage 4.
s[0] = RangeCheckValue(s[0] + s[5], range + 12);
s[1] = RangeCheckValue(s[1] - s[6], range + 12);
// stages 5 and 6.
- const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12);
- const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12);
- int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12);
+ const Intermediate x0 = RangeCheckValue(s[0] + s[3], range + 12);
+ const Intermediate x1 = RangeCheckValue(s[1] + s[3], range + 12);
+ Intermediate x3 = RangeCheckValue(s[0] + s[1], range + 12);
x3 = RangeCheckValue(x3 - s[3], range + 12);
- int32_t dst_0 = RightShiftWithRounding(x0, 12);
- int32_t dst_1 = RightShiftWithRounding(x1, 12);
- int32_t dst_2 = RightShiftWithRounding(s[2], 12);
- int32_t dst_3 = RightShiftWithRounding(x3, 12);
+ auto dst_0 = static_cast<int32_t>(RightShiftWithRounding(x0, 12));
+ auto dst_1 = static_cast<int32_t>(RightShiftWithRounding(x1, 12));
+ auto dst_2 = static_cast<int32_t>(RightShiftWithRounding(s[2], 12));
+ auto dst_3 = static_cast<int32_t>(RightShiftWithRounding(x3, 12));
if (sizeof(Residual) == 2) {
// If the first argument to RightShiftWithRounding(..., 12) is only
// slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
@@ -840,6 +861,10 @@ void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
template <typename Residual>
void Identity4Row_C(void* dest, int8_t shift) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
assert(shift == 0 || shift == 1);
auto* const dst = static_cast<Residual*>(dest);
// If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
@@ -847,10 +872,10 @@ void Identity4Row_C(void* dest, int8_t shift) {
// values of |shift|.
const int32_t rounding = (1 + (shift << 1)) << 11;
for (int i = 0; i < 4; ++i) {
- // The intermediate value here will have to fit into an int32_t for it to be
- // bitstream conformant. The multiplication is promoted to int32_t by
- // defining kIdentity4Multiplier as int32_t.
- int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[i]) * kIdentity4Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -874,16 +899,24 @@ void Identity4Column_C(void* dest, int8_t /*shift*/) {
template <int bitdepth, typename Residual>
void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
auto* const dst = static_cast<Residual*>(dest);
if (is_row) {
if (should_round) {
- dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
}
const int32_t rounding = (1 + (row_shift << 1)) << 11;
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kIdentity4Multiplier;
int32_t dst_i =
- (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift);
+ static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -923,11 +956,17 @@ void Identity8Column_C(void* dest, int8_t /*shift*/) {
template <int bitdepth, typename Residual>
void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
auto* const dst = static_cast<Residual*>(dest);
if (is_row) {
if (should_round) {
- dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
}
int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift);
@@ -954,13 +993,19 @@ void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
template <typename Residual>
void Identity16Row_C(void* dest, int8_t shift) {
assert(shift == 1 || shift == 2);
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
auto* const dst = static_cast<Residual*>(dest);
const int32_t rounding = (1 + (1 << shift)) << 11;
for (int i = 0; i < 16; ++i) {
- // The intermediate value here will have to fit into an int32_t for it to be
- // bitstream conformant. The multiplication is promoted to int32_t by
- // defining kIdentity16Multiplier as int32_t.
- int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[i]) * kIdentity16Multiplier;
+ int32_t dst_i =
+ static_cast<int32_t>((intermediate + rounding) >> (12 + shift));
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -985,16 +1030,24 @@ void Identity16Column_C(void* dest, int8_t /*shift*/) {
template <int bitdepth, typename Residual>
void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
auto* const dst = static_cast<Residual*>(dest);
if (is_row) {
if (should_round) {
- dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
}
const int32_t rounding = (1 + (1 << row_shift)) << 11;
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kIdentity16Multiplier;
int32_t dst_i =
- (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift);
+ static_cast<int32_t>((intermediate + rounding) >> (12 + row_shift));
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -1034,11 +1087,17 @@ void Identity32Column_C(void* dest, int8_t /*shift*/) {
template <int bitdepth, typename Residual>
void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
int row_shift, bool is_row) {
+ // Note the intermediate value can only exceed 32 bits with 12-bit content.
+ // For simplicity in unoptimized code, int64_t is used for both 10 & 12-bit.
+ using Intermediate =
+ typename std::conditional<sizeof(Residual) == 2, int32_t, int64_t>::type;
auto* const dst = static_cast<Residual*>(dest);
if (is_row) {
if (should_round) {
- dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+ const auto intermediate =
+ static_cast<Intermediate>(dst[0]) * kTransformRowMultiplier;
+ dst[0] = RightShiftWithRounding(intermediate, 12);
}
int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift);
@@ -1612,6 +1671,148 @@ void Init10bpp() {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ InitAll<12, int32_t, uint16_t>(dsp);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize64_Transform1dDct
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dDct,
+ DctDcOnly_C<12, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst4DcOnly_C<12, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst8DcOnly_C<12, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dAdst
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dAdst,
+ Adst16DcOnly_C<12, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<12, int32_t>, Identity4Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity4DcOnly_C<12, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize8_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<12, int32_t>, Identity8Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity8DcOnly_C<12, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize16_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<12, int32_t>, Identity16Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity16DcOnly_C<12, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize32_Transform1dIdentity
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<12, int32_t>, Identity32Row_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dIdentity,
+ Identity32DcOnly_C<12, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_Transform1dSize4_Transform1dWht
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ TransformLoop_C<12, int32_t, uint16_t, kTransform1dWht,
+ Wht4DcOnly_C<12, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
void InverseTransformInit_C() {
@@ -1619,10 +1820,12 @@ void InverseTransformInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
// Local functions that may be unused depending on the optimizations
// available.
- static_cast<void>(RangeCheckValue);
static_cast<void>(kBitReverseLookup);
}
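
The recurring change in this file is the Intermediate alias: products that fit 32 bits for the int16_t residuals used at 8bpp are formed in int64_t for the int32_t residuals used at 10/12bpp, then narrowed after the rounding shift. A self-contained sketch of the pattern, assuming the Q12 multiplier 5793 (round(sqrt(2) * 2^12), as in kIdentity4Multiplier) and plain round-to-nearest; the library's Identity4Row_C additionally folds a shift-dependent rounding constant and 16-bit clipping in:

    #include <cstdint>
    #include <type_traits>

    constexpr int32_t kMultiplier = 5793;  // round(sqrt(2) * 2^12)

    template <typename Residual>
    Residual IdentityScale(Residual sample, int shift) {
      // 8bpp residuals are int16_t and the product fits in 32 bits; the
      // int32_t residuals used at 10/12bpp can push sample * 5793 past
      // INT32_MAX, so the intermediate widens to int64_t there.
      using Intermediate = typename std::conditional<sizeof(Residual) == 2,
                                                     int32_t, int64_t>::type;
      const Intermediate product =
          static_cast<Intermediate>(sample) * kMultiplier;
      const Intermediate rounding = Intermediate{1} << (11 + shift);
      return static_cast<Residual>((product + rounding) >> (12 + shift));
    }
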
diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc
index 0ae23df..081dcc1 100644
--- a/src/dsp/inverse_transform_test.cc
+++ b/src/dsp/inverse_transform_test.cc
@@ -69,6 +69,7 @@ template <int bitdepth, typename SrcPixel, typename DstPixel>
class InverseTransformTestBase : public testing::TestWithParam<TransformSize>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
InverseTransformTestBase() {
switch (tx_size_) {
case kNumTransformSizes:
@@ -148,6 +149,7 @@ template <int bitdepth, typename Pixel, typename DstPixel>
class InverseTransformTest
: public InverseTransformTestBase<bitdepth, Pixel, DstPixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
InverseTransformTest() = default;
InverseTransformTest(const InverseTransformTest&) = delete;
InverseTransformTest& operator=(const InverseTransformTest&) = delete;
@@ -533,6 +535,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using InverseTransformTest12bpp = InverseTransformTest<12, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest12bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest12bpp, DISABLED_Speed) { TestRandomValues(12000); }
+
+TEST_P(InverseTransformTest12bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest12bpp,
+ testing::ValuesIn(kTransformSizesAll));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
index 4bd1443..fedb35b 100644
--- a/src/dsp/libgav1_dsp.cmake
+++ b/src/dsp/libgav1_dsp.cmake
@@ -113,6 +113,7 @@ list(APPEND libgav1_dsp_sources_neon
"${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+ "${libgav1_source}/dsp/arm/loop_filter_10bit_neon.cc"
"${libgav1_source}/dsp/arm/loop_filter_neon.cc"
"${libgav1_source}/dsp/arm/loop_filter_neon.h"
"${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc"
diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc
index 14d47bf..bb0583f 100644
--- a/src/dsp/loop_filter.cc
+++ b/src/dsp/loop_filter.cc
@@ -603,6 +603,73 @@ void Init10bpp() {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using Defs12bpp = LoopFilterFuncs_C<12, uint16_t>;
+
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal4;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical4;
+
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal6;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical6;
+
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal8;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical8;
+
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal14;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical14;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize4_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize6_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize8_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Defs12bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_LoopFilterSize14_LoopFilterTypeVertical
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Defs12bpp::Vertical14;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
void LoopFilterInit_C() {
@@ -610,6 +677,9 @@ void LoopFilterInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
// Local functions that may be unused depending on the optimizations
// available.
static_cast<void>(AdjustThresholds);
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
index d013a1b..63ed530 100644
--- a/src/dsp/loop_filter_test.cc
+++ b/src/dsp/loop_filter_test.cc
@@ -106,6 +106,7 @@ void InitInput(Pixel* dst, const int stride, const int bitdepth,
template <int bitdepth, typename Pixel>
class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
LoopFilterTest() = default;
LoopFilterTest(const LoopFilterTest&) = delete;
LoopFilterTest& operator=(const LoopFilterTest&) = delete;
@@ -132,6 +133,9 @@ class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
}
} else if (absl::StartsWith(test_case, "NEON/")) {
LoopFilterInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopFilterInit10bpp_NEON();
+#endif
} else {
FAIL() << "Unrecognized architecture prefix in test case name: "
<< test_case;
@@ -203,22 +207,23 @@ void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
template <int bitdepth, typename Pixel>
void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const {
- const LoopFilterType filter = kLoopFilterTypeHorizontal;
- if (cur_loop_filters_[filter] == nullptr) return;
-
Pixel dst[kNumPixels], ref[kNumPixels];
const auto value = static_cast<Pixel>((1 << bitdepth) - 1);
for (auto& r : dst) r = value;
memcpy(ref, dst, sizeof(dst));
- const int outer_thresh = 24;
- const int inner_thresh = 8;
- const int hev_thresh = 0;
- cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride,
- outer_thresh, inner_thresh, hev_thresh);
- ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
- kBlockStride, kBlockStride, true))
- << "kLoopFilterTypeHorizontal output doesn't match reference";
+ for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+ if (cur_loop_filters_[i] == nullptr) return;
+ const int outer_thresh = 24;
+ const int inner_thresh = 8;
+ const int hev_thresh = 0;
+ cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride, outer_thresh,
+ inner_thresh, hev_thresh);
+ ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
+ kBlockStride, kBlockStride, true))
+ << ToString(static_cast<LoopFilterType>(i))
+ << " output doesn't match reference";
+ }
}
//------------------------------------------------------------------------------
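
TestSaturatedValues now walks every filter type instead of only the horizontal one. The invariant it checks: a block whose pixels all sit at the bitdepth maximum is perfectly flat, so a correct loop filter must leave it unchanged; any difference means an internal overflow wrapped a value. A compact sketch of that invariant with illustrative names (kStride and FilterFn stand in for the test's kBlockStride and filter signature):

    #include <cstdint>
    #include <cstring>

    constexpr int kStride = 32;
    constexpr int kNumPixels = kStride * kStride;

    using FilterFn = void (*)(uint16_t* dst, int stride, int outer_thresh,
                              int inner_thresh, int hev_thresh);

    bool FlatBlockIsUntouched(FilterFn filter, int bitdepth) {
      uint16_t dst[kNumPixels], ref[kNumPixels];
      const auto value = static_cast<uint16_t>((1 << bitdepth) - 1);
      for (auto& p : dst) p = value;   // saturated, perfectly flat block
      std::memcpy(ref, dst, sizeof(dst));
      filter(dst + 8 + kStride * 8, kStride, /*outer_thresh=*/24,
             /*inner_thresh=*/8, /*hev_thresh=*/0);
      return std::memcmp(ref, dst, sizeof(dst)) == 0;  // unchanged output
    }
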
@@ -328,6 +333,8 @@ TEST_P(LoopFilterTest10bpp, FixedInput) {
TestRandomValues(GetDigests10bpp(size_), kNumTests);
}
+TEST_P(LoopFilterTest10bpp, SaturatedValues) { TestSaturatedValues(); }
+
INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
testing::ValuesIn(kLoopFilterSizes));
@@ -339,7 +346,59 @@ INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp,
testing::ValuesIn(kLoopFilterSizes));
#endif
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using LoopFilterTest12bpp = LoopFilterTest<12, uint16_t>;
+
+const char* const* GetDigests12bpp(LoopFilterSize size) {
+ static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+ "a14599cbfe2daee633d556a15c47b1f6",
+ "1f0a0794832de1012e2fed6b1cb02e69",
+ };
+ static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+ "c76b24a73139239db10f16f36e01a625",
+ "3f75d904e9dcb1886e84a0f03f60f31e",
+ };
+ static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+ "57c6f0efe2ab3957f5500ca2a9670f37",
+ "caa1f90c2eb2b65b280d678f8fcf6be8",
+ };
+ static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+ "0c58f7466c36c3f4a2c1b4aa1b80f0b3",
+ "63077978326e6dddb5b2c3bfe6d684f5",
+ };
+
+ switch (size) {
+ case kLoopFilterSize4:
+ return kDigestsSize4;
+ case kLoopFilterSize6:
+ return kDigestsSize6;
+ case kLoopFilterSize8:
+ return kDigestsSize8;
+ case kLoopFilterSize14:
+ return kDigestsSize14;
+ default:
+ ADD_FAILURE() << "Unknown loop filter size" << size;
+ return nullptr;
+ }
+}
+
+TEST_P(LoopFilterTest12bpp, DISABLED_Speed) {
+ TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest12bpp, FixedInput) {
+ TestRandomValues(GetDigests12bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest12bpp, SaturatedValues) { TestSaturatedValues(); }
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest12bpp,
+ testing::ValuesIn(kLoopFilterSizes));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
index 2301a3e..eb8052c 100644
--- a/src/dsp/loop_restoration.cc
+++ b/src/dsp/loop_restoration.cc
@@ -922,7 +922,6 @@ void Init8bpp() {
}
#if LIBGAV1_MAX_BITDEPTH >= 10
-
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
assert(dsp != nullptr);
@@ -939,8 +938,27 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WienerFilter
+ dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_SelfGuidedFilter
+ dsp->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
void LoopRestorationInit_C() {
@@ -948,6 +966,9 @@ void LoopRestorationInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
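
This Init12bpp() shape repeats across nearly every file in this patch (mask_blend.cc, obmc.cc, super_res.cc, warp.cc, weight_mask.cc): fetch the writable per-bitdepth function-pointer table, then install the C implementation either unconditionally (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS) or only when no optimized version has claimed the slot via its LIBGAV1_Dsp12bpp_* define. A simplified, compilable sketch of the pattern; the struct and filter function are stand-ins, not the real dsp.h declarations:

```cpp
#include <cassert>
#include <cstdint>

namespace {

// Stand-in for the per-bitdepth function-pointer table declared in dsp.h.
struct DspTable {
  void (*loop_restorations[2])(const uint16_t* src, uint16_t* dst, int width);
};

DspTable g_dsp_12bpp;  // The real code fetches this via GetWritableDspTable(12).

template <int bitdepth, typename Pixel>
void WienerFilter_C(const Pixel* /*src*/, Pixel* /*dst*/, int /*width*/) {}

void Init12bpp() {
  DspTable* const dsp = &g_dsp_12bpp;
  assert(dsp != nullptr);
// If an optimized init already defined LIBGAV1_Dsp12bpp_WienerFilter, the C
// fallback is skipped and whatever pointer that init installed is kept.
#ifndef LIBGAV1_Dsp12bpp_WienerFilter
  dsp->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
#endif
}

}  // namespace

int main() {
  Init12bpp();
  return g_dsp_12bpp.loop_restorations[0] == nullptr ? 1 : 0;
}
```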
diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h
index de80926..8fefc40 100644
--- a/src/dsp/loop_restoration.h
+++ b/src/dsp/loop_restoration.h
@@ -39,16 +39,6 @@
namespace libgav1 {
namespace dsp {
-enum {
- // Precision of a division table (mtable)
- kSgrProjScaleBits = 20,
- kSgrProjReciprocalBits = 12,
- // Core self-guided restoration precision bits.
- kSgrProjSgrBits = 8,
- // Precision bits of generated values higher than source before projection.
- kSgrProjRestoreBits = 4
-}; // anonymous enum
-
extern const uint8_t kSgrMaLookup[256];
// Initializes Dsp::loop_restorations. This function is not thread-safe.
diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc
index 4c54bc6..5c645b8 100644
--- a/src/dsp/loop_restoration_test.cc
+++ b/src/dsp/loop_restoration_test.cc
@@ -55,6 +55,7 @@ template <int bitdepth, typename Pixel>
class SelfGuidedFilterTest : public testing::TestWithParam<int>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
SelfGuidedFilterTest() = default;
SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete;
SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete;
@@ -159,26 +160,34 @@ void SelfGuidedFilterTest<bitdepth, Pixel>::SetInputData(
template <int bitdepth, typename Pixel>
void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index,
Pixel value) {
- static const char* const kDigest[][2][kNumRadiusTypes] = {
+ static const char* const kDigest[][3][kNumRadiusTypes] = {
{{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0",
"a03314fc210bee68c7adbb44d2bbdac7"},
{"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54",
- "a6583fe9359877f4a259c81d900fc4fb"}},
+ "a6583fe9359877f4a259c81d900fc4fb"},
+ {"8f9b6944c8965f34d444a667da3b0ebe", "84fa62c491c67c3a435fd5140e7a4f82",
+ "d04b62d97228789e5c6928d40d5d900e"}},
{{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a",
"27100f37b3e42a5f2a051e1566edb6f8"},
{"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3",
- "69c274ac59c99999e1bfbf2fc4586ebd"}},
+ "69c274ac59c99999e1bfbf2fc4586ebd"},
+ {"86ff2318bf8a584b8d5edd710681d621", "f6e1c104a764d6766cc278d5b216855a",
+ "6d928703526ab114efba865ff5b11886"}},
{{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4",
"92f31086ba2f9e1508983b22d93a4e5c"},
{"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13",
- "43dd7df2c2a601262c68cd8af1c61b82"}},
+ "43dd7df2c2a601262c68cd8af1c61b82"},
+ {"1ab6138c3a82ac8ccd840f0553fdfb58", "be3bf92633f7165d3ad9c327d2dd53fe",
+ "41115efff3adeb541e04db23faa22f23"}},
{{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4",
"f8a6a025827f29f857bed3e28ba3ea33"},
{"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42",
- "20dcbe8e317a4373bebf11d56adc5f02"}}};
+ "20dcbe8e317a4373bebf11d56adc5f02"},
+ {"7971a60337fcdb662c92db051bd0bb41", "75f89f346c2a37bf0c6695c0482531e6",
+ "1595eeacd62cdce4d2fb094534c22c1e"}}};
if (target_self_guided_filter_func_ == nullptr) return;
ASSERT_LT(value, 1 << bitdepth);
- constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ constexpr int bd_index = (bitdepth - 8) / 2;
libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
const Pixel* const src = src_ + kOffset;
Pixel* const dst = dst_ + kOffset;
@@ -207,29 +216,39 @@ void SelfGuidedFilterTest<bitdepth, Pixel>::TestFixedValues(int test_index,
template <int bitdepth, typename Pixel>
void SelfGuidedFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
- static const char* const kDigest[][2][kNumRadiusTypes] = {
+ static const char* const kDigest[][3][kNumRadiusTypes] = {
{{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb",
"ca67159cd29475ac5d52ca4a0df3ea10"},
{"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678",
- "a8ba988283d9e1ad1f0dcdbf6bbdaade"}},
+ "a8ba988283d9e1ad1f0dcdbf6bbdaade"},
+ {"d95e98d031f9ba290e5183777d1e4905", "f806853cfadb50e6dbd4898412b92934",
+ "741fbfdb79cda695afedda3d51dbb27f"}},
{{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9",
"a4005899fa8d3c3c4669910f93ff1290"},
{"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e",
- "07203ad761775d5d317f2b7884afd9fe"}},
+ "07203ad761775d5d317f2b7884afd9fe"},
+ {"76b9ef906090fa81af64cce3bba0a54a", "8eecc59acdef8953aa9a96648c0ccd2c",
+ "6e45a0ef60e0475f470dc93552047f07"}},
{{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084",
"475bcb6a58f87da7723f6227bc2aca0e"},
{"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f",
- "7cb5c5dbdb3d1c54cfa00def450842dc"}},
+ "7cb5c5dbdb3d1c54cfa00def450842dc"},
+ {"0e3dc23150d18c9d366d15e174727311", "8495122917770d822f1842ceff987b03",
+ "4aeb9db902072cefd6af0aff8aaabd24"}},
{{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9",
"f1eda6d15b37172199d9949c2315832f"},
{"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8",
- "b23dc0b54c3500248d53377030428a61"}},
+ "b23dc0b54c3500248d53377030428a61"},
+ {"9c331f2b9410354685fe904f6c022dfa", "b540b0045b7723fbe962fd675db4b077",
+ "3cecd1158126c9c9cc2873ecc8c1a135"}},
{{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36",
"23966cba3e0e7803eeb951905861e0dd"},
{"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de",
- "dcee48f94126a2132963e86e93dd4903"}}};
+ "dcee48f94126a2132963e86e93dd4903"},
+ {"beb3dd8a2dbc5f83ef171b0ffcead3ab", "c373bd9c46bdb89a3d1e41759c315025",
+ "cd407b212ab46fd4a451d5dc93a0ce4a"}}};
if (target_self_guided_filter_func_ == nullptr) return;
- constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ constexpr int bd_index = (bitdepth - 8) / 2;
const int num_inputs = speed ? 1 : 5;
#if LIBGAV1_ENABLE_NEON
const int num_tests = speed ? 4000 : 1;
@@ -324,10 +343,28 @@ INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp,
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SelfGuidedFilterTest12bpp = SelfGuidedFilterTest<12, uint16_t>;
+
+TEST_P(SelfGuidedFilterTest12bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 2048);
+ TestFixedValues(3, 4095);
+ TestRandomValues(false);
+}
+
+TEST_P(SelfGuidedFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest12bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
template <int bitdepth, typename Pixel>
class WienerFilterTest : public testing::TestWithParam<int>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
WienerFilterTest() = default;
WienerFilterTest(const WienerFilterTest&) = delete;
WienerFilterTest& operator=(const WienerFilterTest&) = delete;
@@ -433,14 +470,17 @@ void WienerFilterTest<bitdepth, Pixel>::SetInputData(
template <int bitdepth, typename Pixel>
void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id,
Pixel value) {
- static const char* const kDigest[2][4] = {
+ static const char* const kDigest[3][4] = {
{"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d",
"1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"},
{"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
- "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}};
+ "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"},
+ {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916",
+ "961eeb92bd9d85eb47e3961ee93d279a", "039a279232bc90eebc0ec2fe3e18a7e1"},
+ };
if (target_wiener_filter_func_ == nullptr) return;
ASSERT_LT(value, 1 << bitdepth);
- constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ constexpr int bd_index = (bitdepth - 8) / 2;
const Pixel* const src = src_ + kOffset;
Pixel* const dst = dst_ + kOffset;
for (const auto vertical_order : kWienerOrders) {
@@ -470,7 +510,7 @@ void WienerFilterTest<bitdepth, Pixel>::TestFixedValues(int digest_id,
template <int bitdepth, typename Pixel>
void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
- static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = {
+ static const char* const kDigest[3][kNumWienerOrders][kNumWienerOrders] = {
{{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b",
"545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"},
{"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9",
@@ -488,9 +528,19 @@ void WienerFilterTest<bitdepth, Pixel>::TestRandomValues(bool speed) {
"d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"},
{"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf",
"7cbc1562a9dd08e1973b3b9ac1afc765",
- "3c91bf1a34672cd40bf261c5820d3ec3"}}};
+ "3c91bf1a34672cd40bf261c5820d3ec3"}},
+ {{"501b57370c781372b514accd03d161af", "a4569b5eff7f7e8b696934d192619be5",
+ "24eb2aa43118a8822f7a6a7384ab9ea7", "edd7ac227733b5a4496bfdbdf4eb34d7"},
+ {"77624cf73299a1bd928eae3eb8945dbe", "b3f311cacbf45fa892761462d31b2598",
+ "977c063d93a4b95cb365363763faa4da", "02313c9d360a1e0180ed05d3e4444c3d"},
+ {"f499655ecdcbe0ac48553f1eee758589", "a009c83c03e47cbd05c1243e28579bd9",
+ "d5f0b4fd761ff51efce949e6c5ec4833", "e3a9a57aacd2e6cfe0f792a885b3e0e3"},
+ {"b4cf906e9bb02ffca15c1e9575962ca2", "d0ca9f933978c0c31175ba1b28a44ae8",
+ "81ac1475530ffbd1c8d3ce7da87ffe6b",
+ "b96412949c2e31b29388222ac8914fa2"}},
+ };
if (target_wiener_filter_func_ == nullptr) return;
- constexpr int bd_index = (bitdepth == 8) ? 0 : 1;
+ constexpr int bd_index = (bitdepth - 8) / 2;
#if LIBGAV1_ENABLE_NEON
const int num_tests = speed ? 5000 : 1;
#else
@@ -630,9 +680,27 @@ INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp,
INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp,
testing::ValuesIn(kUnitWidths));
#endif
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WienerFilterTest12bpp = WienerFilterTest<12, uint16_t>;
+
+TEST_P(WienerFilterTest12bpp, Correctness) {
+ TestFixedValues(0, 0);
+ TestFixedValues(1, 1);
+ TestFixedValues(2, 2048);
+ TestFixedValues(3, 4095);
+ TestRandomValues(false);
+}
+
+TEST_P(WienerFilterTest12bpp, DISABLED_Speed) { TestRandomValues(true); }
+
+TEST_P(WienerFilterTest12bpp, TestCompare2C) { TestCompare2C(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest12bpp,
+ testing::ValuesIn(kUnitWidths));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
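
The recurring bd_index change in this file replaces the old two-way ternary with (bitdepth - 8) / 2 so the digest tables can grow a third row for 12bpp: 8 maps to row 0, 10 to row 1, 12 to row 2. A compile-time check of that mapping (a sketch, not part of the patch):

```cpp
// Maps 8 -> 0, 10 -> 1, 12 -> 2: the row index into the widened tables.
constexpr int BitdepthIndex(int bitdepth) { return (bitdepth - 8) / 2; }

static_assert(BitdepthIndex(8) == 0, "8bpp digests are in row 0");
static_assert(BitdepthIndex(10) == 1, "10bpp digests are in row 1");
static_assert(BitdepthIndex(12) == 2, "12bpp digests are in row 2");
```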
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
index 207fde0..34d7fe8 100644
--- a/src/dsp/mask_blend.cc
+++ b/src/dsp/mask_blend.cc
@@ -197,7 +197,50 @@ void Init10bpp() {
dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+ dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
+ dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+ dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+ dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend444
+ dsp->mask_blend[0][0] = MaskBlend_C<12, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend422
+ dsp->mask_blend[1][0] = MaskBlend_C<12, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlend420
+ dsp->mask_blend[2][0] = MaskBlend_C<12, uint16_t, false, 1, 1>;
#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra444
+ dsp->mask_blend[0][1] = MaskBlend_C<12, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra422
+ dsp->mask_blend[1][1] = MaskBlend_C<12, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_MaskBlendInterIntra420
+ dsp->mask_blend[2][1] = MaskBlend_C<12, uint16_t, true, 1, 1>;
+#endif
+ // These are only used with 8-bit.
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+ dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -206,6 +249,9 @@ void MaskBlendInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
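
In the mask_blend table populated above, the first index is a subsampling code derived from the template arguments (0 for 4:4:4, 1 for 4:2:2, 2 for 4:2:0) and the second selects the inter-intra variant. A small sketch of that first-index mapping, inferred from the MaskBlend_C<..., subsampling_x, subsampling_y> instantiations; the helper name is hypothetical:

```cpp
// Inferred from the instantiations above:
//   (0,0) -> 0 (4:4:4), (1,0) -> 1 (4:2:2), (1,1) -> 2 (4:2:0).
constexpr int SubsamplingIndex(int subsampling_x, int subsampling_y) {
  return subsampling_x + subsampling_y;
}

static_assert(SubsamplingIndex(0, 0) == 0, "4:4:4");
static_assert(SubsamplingIndex(1, 0) == 1, "4:2:2");
static_assert(SubsamplingIndex(1, 1) == 2, "4:2:0");
```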
diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc
index be80b11..29dd43b 100644
--- a/src/dsp/mask_blend_test.cc
+++ b/src/dsp/mask_blend_test.cc
@@ -14,6 +14,7 @@
#include "src/dsp/mask_blend.h"
+#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
@@ -103,6 +104,8 @@ const char* GetDigest8bpp(int id) {
"beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff",
"be8abad1da69e4d238a45fc02a0061cf",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
@@ -157,10 +160,69 @@ const char* GetDigest10bpp(int id) {
"56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442",
"f4a4f4d7c8b93c0486cf3cbaa26fbc19",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "79a505b3877177197c94f0faeb0c9ec6", "cd22657d242f30c88bb83eae9efbbcce",
+ "c4c60a60976d119df3832ff6956e0181", "796bd78bf2346e8dfd61cecbf508ea0e",
+ "79e06cc6f880daf6cdb59b9b3a8efe1c", "f0643108e6b57bd566bc0d47b2dc64a1",
+ "8272a471e538ca469eaf5c997309589c", "3094741b63a29925da83dc1dc187654a",
+ "d0141df80f2335ed6051397cb2a5bc61", "33d9fd317b74f4572afbe004f991ca83",
+ "ea2413cd11bf1da93de9285381b471df", "c4f78ae2b994a3a999cb3f5dac2bb498",
+ "44804ec226453bc5f688506b56ad2a8a", "9de9c12a5f3bb8d4af13da8807dfe53f",
+ "c190dac15c08f2e591b222e1d75b60c2", "c46889b58b44d242e24b91ef531e9176",
+ "b6697e1256b60b3426a8980c7c6f9a80", "1e0eb156152fbb74b0cff41bdbdf98b5",
+ "98ab6c0abc45fd44565f84e66dc71133", "f2f2126fac1b7c0c7b7ff511c6f3c91e",
+ "0cc720e878cfa35f9b72762d08adb1bf", "6efee9ce87e098122dd05525f4c74a2f",
+ "187270514a93bd7065d2cfdb02146959", "947be7f2921b5a192d4296b2060a215c",
+ "42f02b046eda2a94133032184fdaa26d", "487e94b20867e7021dd1f10d477c3acf",
+ "9f9eac4394d8821f5c14857a28c5549b", "75d781b60c1f4aa44ceb6bc65f597a52",
+ "779f9ac3c01a86812964ccc38da2711a", "16dc8824efbd7a47808ccdbf8e37df56",
+ "e72899a8ddf6cc816e1917c25739a512", "96a4bcaedae79b55399d931fecd64312",
+ "5c5e8f4a4f0153315133e4e86a02c3a6", "d1c339b6f6cc0eabdd6674028e1f4260",
+ "4ef5868adaf6712d033dce9e51837c0b", "ed90a4ddfc463dddfe71314bc3415b4e",
+ "2312299492a47246269d6d37e67c8c0c", "56baf1c4453c5cf5ce3d6857cff4aa8f",
+ "d534ce3430377b355c3f59695cfb188b", "f40248f1a6fac4299c9645350138f598",
+ "f2e3cbbd066d9d28304667d82312d950", "e8a7784eb367b72b96486bec856b873c",
+ "02941ae2cf8272b353268a30cf9c2ee0", "8f6273a5fa62b9a4225ebdbf2ce44e27",
+ "85bb0aaba73fe8c89dcee6b5c55d5cfc", "c28c63a4e46ee2a98dd2b58379971c8c",
+ "4af35738c29d27ca9930a488bacdffe6", "34a419cc3e6ab21cf099d244169d253e",
+ "7c5b8d19ac8a81b37011fabac10143d0", "e582811e05def83270d8f65060fe8966",
+ "24662536326615a3c325409e780f65bf", "717a7f7e99d329a74391477ef3c6d738",
+ "e0f38a3dba4c6e060b6ca12a18d75fc2", "fbd0cba6a27eb06e74c5ed376187e05c",
+ "14dfb487c4a7e989629a195810b814ee", "3cf6d595317ec46e08f6eaa0f0e99b43",
+ "b3cb98c418ea854e433b612fc532bac5", "262206cee670c082361497e51cbd0f43",
+ "84c11b103a9b0a61f07493dcd269e6fd", "bd9bd9994057371252398bf52c7586f0",
+ "72e5537ba5f04fe17b7a371bd12ca0e2", "5986a20b406ceed273f9e41bc0c4c775",
+ "d5eb9ea00ce19079b49562ba4a8cb574", "3205e6f3c532a63f8d5d939fa46bc444",
+ "cfb21ac467f21954903948d4e6c9a2a1", "bd9fd6aab18bbba8096746f9ed35a640",
+ "d42ec4f13f042014c5b4af5f03d19034", "8a7fdee2b57ac641e03365625850f5d6",
+ "d18638521275b3aa9dd463d067d6a390", "a7a71c433d85576198b52608c99cab47",
+ "96e2a2443bf8cfe32d7590c5011c7523", "6fbe7cd83208937229c11a8e3be5e1e9",
+ "ecf66dac310e332a108be639171b5cf3", "327b1656c61d795c30a914f52e3d7629",
+ "157d26190bde1a6f34680708bff5d02e", "d927bba0073263a7914a4076a5edfe29",
+ "b88930ec68e5e49da8204ef21635cea2", "58e174ed0036b1ac1f5a9bdd44860222",
+ "415055dfa80c6fe7c12e4d16cac22168", "9058939bfb5998d6ecd71d87a52be893",
+ "847894efa35f1528732ec3584f62f86f", "8aa9b33c0d9695690cb4088c32f31214",
+ "11e28ab9a3192a2bc9ffd3fd0a466a13", "f246009c5efafd9310fa8e365d23cab4",
+ "2381fcd9ee0ffceba5509879d9f5709d", "1cf1dc7c7c6ecf1f3381455c99e2239e",
+ "e74601883b53791045f50bbcbbbcc803", "22926eecefa94f9f39b9bb9dbb183e5b",
+ "128c24f5a5342aebb21bdaa87907daf7", "11c39f844a2e51cc4c80ffe1afa58e70",
+ "2c0548cff2145031e304d8f97abfd751", "66e1a3daf84029341b999b18bf86e5b3",
+ "0f790f210d5366bbad7eb352b4909dd9",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct MaskBlendTestParam {
MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y,
bool is_inter_intra, bool is_wedge_inter_intra)
@@ -192,6 +254,7 @@ template <int bitdepth, typename Pixel>
class MaskBlendTest : public testing::TestWithParam<MaskBlendTestParam>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
MaskBlendTest() = default;
~MaskBlendTest() override = default;
@@ -310,6 +373,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
PredType* src_2 = source2_;
uint8_t* src_2_8bpp = source2_8bpp_;
const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width;
+ const ptrdiff_t mask_stride = param_.width;
uint8_t* mask_row = mask_;
const int range_mask = (1 << (bitdepth)) - 1;
for (int y = 0; y < height; ++y) {
@@ -340,7 +404,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
mask_row[x] = rnd.Rand8() & 63;
mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64].
}
- mask_row += kStride;
+ mask_row += mask_stride;
}
absl::Duration elapsed_time;
@@ -351,7 +415,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), "");
// source2_8bpp_ is modified in the call.
memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_));
- func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride,
+ func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, mask_stride,
width, height);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
@@ -363,7 +427,7 @@ void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest,
if (bitdepth != 8) {
ASSERT_EQ(func_8bpp_, nullptr);
}
- func_(source1_, source2_, src_2_stride, mask_, kStride, width, height,
+ func_(source1_, source2_, src_2_stride, mask_, mask_stride, width, height,
dest_, kDestStride);
}
elapsed_time += absl::Now() - start;
@@ -520,6 +584,19 @@ INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using MaskBlendTest12bpp = MaskBlendTest<12, uint16_t>;
+
+TEST_P(MaskBlendTest12bpp, Blending) { Test(GetDigest12bpp(GetDigestId()), 1); }
+
+TEST_P(MaskBlendTest12bpp, DISABLED_Speed) {
+ Test(GetDigest12bpp(GetDigestId()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest12bpp,
+ testing::ValuesIn(kMaskBlendTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
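
A behavioral detail worth noting in the test changes above: the mask buffer is now generated and consumed with a packed stride equal to the block width (mask_stride = param_.width) instead of the padded kStride, so mask rows are contiguous. A compilable sketch of the packed fill loop, mirroring the updated test; the function and RNG parameter are hypothetical:

```cpp
#include <cstdint>

// Fills a packed mask (stride == width), as the updated test does; with the
// old padded layout the row advance would have been kStride, not width.
void FillPackedMask(uint8_t* mask, int width, int height, uint8_t (*rand8)()) {
  uint8_t* row = mask;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      row[x] = rand8() & 63;
      row[x] += rand8() & 1;  // Range of mask is [0, 64].
    }
    row += width;  // Packed: rows are contiguous.
  }
}

int main() {
  uint8_t mask[8 * 8];
  FillPackedMask(mask, 8, 8, +[]() -> uint8_t { return 42; });
  return mask[0] == 42 ? 0 : 1;
}
```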
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
index 6b5c6e3..479cb1d 100644
--- a/src/dsp/obmc.cc
+++ b/src/dsp/obmc.cc
@@ -116,7 +116,28 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_ObmcVertical
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp12bpp_ObmcHorizontal
+ dsp->obmc_blend[kObmcDirectionHorizontal] =
+ OverlapBlendHorizontal_C<uint16_t>;
#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -125,6 +146,9 @@ void ObmcInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc
index 3672e12..a10feb2 100644
--- a/src/dsp/obmc_test.cc
+++ b/src/dsp/obmc_test.cc
@@ -15,6 +15,7 @@
#include "src/dsp/obmc.h"
#include <algorithm>
+#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
@@ -58,6 +59,8 @@ const char* GetDigest8bpp(int id) {
"98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548",
"33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
@@ -75,6 +78,8 @@ const char* GetDigestSpeed8bpp(int id) {
"29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5",
"a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
@@ -93,6 +98,8 @@ const char* GetDigest10bpp(int id) {
"e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a",
"b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
@@ -110,10 +117,52 @@ const char* GetDigestSpeed10bpp(int id) {
"b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac",
"13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return kDigest[id];
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "eb18c776d7b56280f01cca40b04a9c44", "058d4a6ed025eac5dcf7aec3203c0882",
+ "8355884d7470e9c6af9309ab23bee859", "2ba330551ac58d1d034b947d7ab9b59f",
+ "0d25cd773c81e4c57f82513e3b031f01", "b9075f7c3b9a240dbb015a24454eeb71",
+ "563ed8683723d1e4f2746280bca3db0a", "d7125306bd8c952d0f85fe1515ca16a7",
+ "5bf99c7e4a918c9b6a7e251484ea6527", "38ac9c685e8d2bd2771b6f2b38268301",
+ "abc39dbde7470e08b15417ee97c704b2", "37e12753d23b7a8df92b1d32f3170d9f",
+ "9a609776cfa31f64826225d0a6b7afdd", "ccdd89e70e94f751fd891b124c1c3210",
+ "2bbf7b095e26ed4f27e7d05e20117084", "9a1b403c3a7c00da5686bcb87f1270e8",
+ "701d651e391043ab8ebbd0023a430980", "0047f10bdd8321494e8e82597fe2f969",
+ "f97e662d139b2811e3d3227de95135a2", "852933b90d4a70f9254157381ed641e0",
+ "cfcda707ec8e4361ef741dc716888348", "95e34eab83b3159f61685db248c6a881",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+
+const char* GetDigestSpeed12bpp(int id) {
+ static const char* const kDigest[] = {
+ "6c0f37c41d72ce40d95545ac0f08d88a", "8a8efeb7d8b2f852d76d0176b6c6878f",
+ "5757c88d1cdc0cd29c47c346474161f0", "fef8cf06d16ba7357bfc061e43080cd3",
+ "6bd11582448532bce8b91cc8807ab6a0", "1e6dd42eada2d636e210f4e20a771102",
+ "377a0472f45fcb42f1712243ea845530", "e3760f2b6e69c1b40e71ecde711d227c",
+ "6721638d1a5dadb96ddd0ca067c737ca", "3d3a23210a8496a76991bcec5045808b",
+ "2cbd26ecf7d4e927ab569083d3ddb4ca", "7d61af2d7841d1a39a2e930bac166804",
+ "dd929506442fb1f2e67130fe8cdf487b", "c0e57f8d2546d5bcb646a24d09d83d7c",
+ "2989c6487456c92eb003c8e17e904f45", "5cfb60a3be6ee5c41e0f655a3020f687",
+ "28f37d47cb07aa382659ff556a55a4c6", "b6478ab317b11f592deb60d02ce62f2f",
+ "bc78e7250c101f82e794d4fa0ee55025", "24304ed23d336a46f205206d3c5d48ef",
+ "dc1e71d95d06c1086bb7f9e05e38bf39", "32606ef72985e7de608df2e8760784b7",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct ObmcTestParam {
ObmcTestParam(int width, int height, ObmcDirection blending_direction)
: width(width), height(height), blending_direction(blending_direction) {}
@@ -130,6 +179,7 @@ std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) {
template <int bitdepth, typename Pixel>
class ObmcBlendTest : public testing::TestWithParam<ObmcTestParam> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
ObmcBlendTest() = default;
~ObmcBlendTest() override = default;
@@ -206,11 +256,12 @@ void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
src_2[x] = rnd.Rand16() & mask;
}
src_1 += kMaxBlendingBlockSize;
- src_2 += kMaxBlendingBlockSize;
+ src_2 += width_;
}
}
const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
- func_(source1_, stride, width_, height_, source2_, stride);
+ func_(source1_, stride, width_, height_, source2_,
+ width_ * sizeof(source2_[0]));
if (use_fixed_values) {
const bool success = test_utils::CompareBlocks(
source1_, source2_, width_, height_, kMaxBlendingBlockSize,
@@ -238,7 +289,7 @@ void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
src_2[x] = rnd.Rand16() & mask;
}
src_1 += kMaxBlendingBlockSize;
- src_2 += kMaxBlendingBlockSize;
+ src_2 += width_;
}
const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize];
@@ -247,7 +298,8 @@ void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
memcpy(dest, source1_,
sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
const absl::Time start = absl::Now();
- func_(dest, stride, width_, height_, source2_, stride);
+ func_(dest, stride, width_, height_, source2_,
+ width_ * sizeof(source2_[0]));
elapsed_time += absl::Now() - start;
}
memcpy(source1_, dest,
@@ -338,6 +390,26 @@ INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ObmcBlendTest12bpp = ObmcBlendTest<12, uint16_t>;
+
+TEST_P(ObmcBlendTest12bpp, Blending) {
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128);
+ Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 12) - 1);
+ Test(GetDigest12bpp(GetDigestId()), /*use_fixed_values=*/false, -1);
+}
+
+TEST_P(ObmcBlendTest12bpp, DISABLED_Speed) {
+ TestSpeed(GetDigestSpeed12bpp(GetDigestId()),
+ kNumSpeedTests / (GetParam().height * GetParam().width));
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest12bpp,
+ testing::ValuesIn(kObmcTestParam));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
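
The OBMC test makes the analogous change: source2_ becomes a packed prediction buffer, and since the blend function takes its second stride in bytes, the call sites pass width_ * sizeof(source2_[0]) rather than the padded stride. A sketch of that byte-stride computation, assuming a uint16_t buffer as in the 10bpp/12bpp tests:

```cpp
#include <cstddef>
#include <cstdint>

// For a packed buffer of Pixel rows, the byte stride is width * sizeof(Pixel).
template <typename Pixel>
constexpr ptrdiff_t PackedByteStride(int width) {
  return static_cast<ptrdiff_t>(width) * sizeof(Pixel);
}

static_assert(PackedByteStride<uint16_t>(32) == 64,
              "32 uint16_t pixels occupy 64 bytes per row");
```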
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
index 570ba73..7593729 100644
--- a/src/dsp/super_res.cc
+++ b/src/dsp/super_res.cc
@@ -95,7 +95,23 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->super_res = SuperRes_C<12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_SuperRes
+ dsp->super_res = SuperRes_C<12, uint16_t>;
#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -104,6 +120,9 @@ void SuperResInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc
index a93fc31..7b253ff 100644
--- a/src/dsp/super_res_test.cc
+++ b/src/dsp/super_res_test.cc
@@ -56,7 +56,16 @@ const char* GetDigest10bpp(int id) {
"126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"};
return kDigestSuperRes[id];
}
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigestSuperRes[] = {
+ "9a08983d82df4983700976f18919201b", "6e5edbafcb6c38db37258bf79c00ea32",
+ "f5c57e6d3b518f9585f768ed19b91568", "b5de9b93c8a1a50580e7c7c9456fb615"};
+ return kDigestSuperRes[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
struct SuperResTestParam {
SuperResTestParam(int downscaled_width, int upscaled_width)
@@ -69,6 +78,7 @@ template <int bitdepth, typename Pixel, typename Coefficient>
class SuperResTest : public testing::TestWithParam<SuperResTestParam>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
SuperResTest() = default;
void SetUp() override {
test_utils::ResetDspTable(bitdepth);
@@ -174,14 +184,23 @@ void SuperResTest<bitdepth, Pixel, Coefficient>::TestComputeSuperRes(
}
}
}
- const char* expected_digest;
- if (bitdepth == 8) {
- expected_digest = GetDigest8bpp(test_id_);
- } else {
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(test_id_);
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- expected_digest = GetDigest10bpp(test_id_);
+ case 10:
+ expected_digest = GetDigest10bpp(test_id_);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(test_id_);
+ break;
#endif
}
+ ASSERT_NE(expected_digest, nullptr);
test_utils::CheckMd5Digest(
"SuperRes",
absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step,
@@ -259,6 +278,25 @@ INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using SuperResTest12bpp = SuperResTest<12, uint16_t, int16_t>;
+
+TEST_P(SuperResTest12bpp, FixedValues) {
+ TestComputeSuperRes(100, 1);
+ TestComputeSuperRes(2047, 1);
+ TestComputeSuperRes(1, 1);
+}
+
+TEST_P(SuperResTest12bpp, RandomValues) { TestComputeSuperRes(0, 1); }
+
+TEST_P(SuperResTest12bpp, DISABLED_Speed) {
+ TestComputeSuperRes(0, kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, SuperResTest12bpp,
+ testing::ValuesIn(kSuperResTestParams));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace dsp
} // namespace libgav1
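
The digest dispatch rewritten in TestComputeSuperRes above scales the old two-way if/else to three bitdepths: a switch whose cases are guarded by the same preprocessor conditionals as the digest getters, with ASSERT_NE catching a bitdepth that has no compiled-in table. The same pattern reappears in warp_test.cc and weight_mask_test.cc below. A compilable sketch with stand-in getters and an assumed LIBGAV1_MAX_BITDEPTH value:

```cpp
#include <cassert>

#define LIBGAV1_MAX_BITDEPTH 10  // Assumption for this sketch only.

const char* GetDigest8bpp(int /*id*/) { return "8bpp-digest"; }
#if LIBGAV1_MAX_BITDEPTH >= 10
const char* GetDigest10bpp(int /*id*/) { return "10bpp-digest"; }
#endif

// Mirrors the switch in the test: pick a digest per bitdepth, with guards
// matching the getters' availability.
const char* DigestForBitdepth(int bitdepth, int id) {
  const char* digest = nullptr;
  switch (bitdepth) {
    case 8:
      digest = GetDigest8bpp(id);
      break;
#if LIBGAV1_MAX_BITDEPTH >= 10
    case 10:
      digest = GetDigest10bpp(id);
      break;
#endif
  }
  assert(digest != nullptr);  // The real tests use ASSERT_NE(digest, nullptr).
  return digest;
}

int main() { return DigestForBitdepth(10, 0) == nullptr; }
```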
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
index dd467ea..f62f1ed 100644
--- a/src/dsp/warp.cc
+++ b/src/dsp/warp.cc
@@ -111,14 +111,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
start_x += 8) {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
@@ -172,22 +166,24 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const Pixel* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const Pixel row_border_pixel = first_row_border[row * source_stride];
DestType* dst_row = dst + start_x - block_start_x;
if (is_compound) {
@@ -220,15 +216,15 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved below.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
int sum = first_row_border[row * source_stride];
sum <<= kFilterBits - kRoundBitsHorizontal;
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
for (int x = 0; x < 8; ++x) {
@@ -269,12 +265,14 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const Pixel* const src_row = src + row * source_stride;
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
int sx = sx4 - MultiplyBy4(alpha);
for (int x = -4; x < 4; ++x) {
@@ -300,7 +298,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// -13 <= column <= (source_width - 1) + 13.
// Therefore we may over-read up to 13 pixels before the source
// row, or up to 13 pixels after the source row.
- const int column = ix4 + x + k - 3;
+ const int column = filter_params.ix4 + x + k - 3;
sum += kWarpedFilters[offset][k] * src_row[column];
}
intermediate_result[y + 7][x + 4] =
@@ -315,7 +313,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
// It follows that -6 <= iy4 <= source_height + 5. This inequality is
// used below.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
// We assume the source frame has top and bottom borders of at least
// 13 pixels that extend the frame boundary pixels.
@@ -326,7 +325,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// -13 <= row <= (source_height - 1) + 13.
// Therefore we may over-read up to 13 pixels above the top source
// row, or up to 13 pixels below the bottom source row.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
const Pixel* const src_row = src + row * source_stride;
int sx = sx4 - MultiplyBy4(alpha);
for (int x = -4; x < 4; ++x) {
@@ -352,7 +351,7 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// -13 <= column <= (source_width - 1) + 13.
// Therefore we may over-read up to 13 pixels before the source
// row, or up to 13 pixels after the source row.
- const int column = ix4 + x + k - 3;
+ const int column = filter_params.ix4 + x + k - 3;
sum += kWarpedFilters[offset][k] * src_row[column];
}
intermediate_result[y + 7][x + 4] =
@@ -367,8 +366,8 @@ void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
// The spec says we should use the following loop condition:
// y < std::min(4, block_start_y + block_height - start_y - 4);
// We can prove that block_start_y + block_height - start_y >= 8, which
@@ -460,7 +459,26 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_Warp
+ dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
#endif
+#ifndef LIBGAV1_Dsp12bpp_WarpCompound
+ dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -469,6 +487,9 @@ void WarpInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
index 7367a9b..9c20f12 100644
--- a/src/dsp/warp.h
+++ b/src/dsp/warp.h
@@ -38,9 +38,39 @@
namespace libgav1 {
namespace dsp {
+// Section 7.11.3.5.
+struct WarpFilterParams {
+ int64_t x4;
+ int64_t y4;
+ int ix4;
+ int iy4;
+};
+
// Initializes Dsp::warp. This function is not thread-safe.
void WarpInit_C();
+// Section 7.11.3.5.
+inline WarpFilterParams GetWarpFilterParams(int src_x, int src_y,
+ int subsampling_x,
+ int subsampling_y,
+ const int* warp_params) {
+ WarpFilterParams filter_params;
+  // warp_params[2]/[5] require 17 bits (the others 14). With large resolutions
+  // the result of the multiplication may require 33 bits, hence the 64-bit math.
+ const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
+ src_y * warp_params[3] + warp_params[0];
+ const int64_t dst_y = src_x * warp_params[4] +
+ static_cast<int64_t>(src_y) * warp_params[5] +
+ warp_params[1];
+ filter_params.x4 = dst_x >> subsampling_x;
+ filter_params.y4 = dst_y >> subsampling_y;
+ filter_params.ix4 =
+ static_cast<int>(filter_params.x4 >> kWarpedModelPrecisionBits);
+ filter_params.iy4 =
+ static_cast<int>(filter_params.y4 >> kWarpedModelPrecisionBits);
+ return filter_params;
+}
+
} // namespace dsp
} // namespace libgav1
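
The substantive fix in GetWarpFilterParams is this promotion to int64_t: with a 17-bit warp parameter and a large subsampled coordinate, the product can need 33 bits, which is signed overflow (undefined behavior) in 32-bit int arithmetic. A small sketch of the overflow the casts avoid, using assumed illustrative magnitudes rather than values from the patch:

```cpp
#include <cstdint>

int main() {
  // Illustrative magnitudes near the 17-bit warp-parameter limit.
  const int32_t warp_param = int32_t{1} << 16;
  const int32_t src_x = int32_t{1} << 16;
  // src_x * warp_param in 32-bit math would overflow (the product needs 33
  // bits); promoting one operand keeps the whole expression in 64 bits.
  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_param;
  return dst_x == (int64_t{1} << 32) ? 0 : 1;
}
```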
diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc
index 4d13051..c64c8d6 100644
--- a/src/dsp/warp_test.cc
+++ b/src/dsp/warp_test.cc
@@ -105,6 +105,8 @@ const char* GetDigest8bpp(int id) {
"4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30",
"b2a0ce68db3cadd207299f73112bed74",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return is_compound ? kCompoundDigest[id] : kDigest[id];
}
@@ -129,9 +131,38 @@ const char* GetDigest10bpp(int id) {
"f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
"42eb66e752e9ef289b47053b5c73fdd6",
};
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
return is_compound ? kCompoundDigest[id] : kDigest[id];
}
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+template <bool is_compound>
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "cd5d5e2102b8917ad70778f523d24bdf", "374a5f1b53a3fdf2eefa741eb71e6889",
+ "311636841770ec2427084891df96bee5", "c40c537917b1f0d1d84c99dfcecd8219",
+ "a1d9bb920e6c3d20c0cf84adc18e1f15", "13b5659acdb39b717526cb358c6f4026",
+ "f81ea4f6fd1f4ebed1262e3fae37b5bb", "c1452fefcd9b9562fe3a0b7f9302809c",
+ "8fed8a3159dc7b6b59a39ab2be6bee13", "b46458bc0e5cf1cee92aac4f0f608749",
+ "2e6a1039ab111add89f5b44b13565f40", "9c666691860bdc89b03f601b40126196",
+ "418a47157d992b94c302ca2e2f6ee07e",
+ };
+ static const char* const kCompoundDigest[] = {
+ "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50",
+ "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38",
+ "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd",
+ "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27",
+ "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9",
+ "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047",
+ "42eb66e752e9ef289b47053b5c73fdd6",
+ };
+ assert(id >= 0);
+ assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+ return is_compound ? kCompoundDigest[id] : kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
int RandomWarpedParam(int seed_offset, int bits) {
libvpx_test::ACMRandom rnd(seed_offset +
@@ -228,6 +259,7 @@ struct WarpTestParam {
template <bool is_compound, int bitdepth, typename Pixel>
class WarpTest : public testing::TestWithParam<WarpTestParam> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
WarpTest() = default;
~WarpTest() override = default;
@@ -389,14 +421,23 @@ void WarpTest<is_compound, bitdepth, Pixel>::Test(bool use_fixed_values,
id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3);
}
- const char* expected_digest;
- if (bitdepth == 8) {
- expected_digest = GetDigest8bpp<is_compound>(id);
- } else {
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp<is_compound>(id);
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- expected_digest = GetDigest10bpp<is_compound>(id);
+ case 10:
+ expected_digest = GetDigest10bpp<is_compound>(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp<is_compound>(id);
+ break;
#endif
}
+ ASSERT_NE(expected_digest, nullptr);
test_utils::CheckMd5Digest(
"Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
expected_digest, dest_, sizeof(dest_), elapsed_time);
@@ -643,7 +684,22 @@ INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param));
INSTANTIATE_TEST_SUITE_P(NEON, WarpTest10bpp,
testing::ValuesIn(warp_test_param));
#endif
-#endif
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WarpTest12bpp = WarpTest</*is_compound=*/false, 12, uint16_t>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical via
+// WarpCompoundTest.
+// using WarpCompoundTest12bpp = WarpTest</*is_compound=*/true, 12, uint16_t>;
+
+TEST_P(WarpTest12bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest12bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest12bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest12bpp, testing::ValuesIn(warp_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) {
return os << "BlockSize" << warp_param.width << "x" << warp_param.height;
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
index 41f4c70..ee3808b 100644
--- a/src/dsp/weight_mask.cc
+++ b/src/dsp/weight_mask.cc
@@ -213,7 +213,86 @@ void Init10bpp() {
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+void Init12bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+ INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+ INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+ INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+ INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+ INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+ INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+ INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
+ INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+ INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+ INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+ INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+ INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+ INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+ INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+ INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+ INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x8
+ INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x16
+ INIT_WEIGHT_MASK(8, 16, 12, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_8x32
+ INIT_WEIGHT_MASK(8, 32, 12, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x8
+ INIT_WEIGHT_MASK(16, 8, 12, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x16
+ INIT_WEIGHT_MASK(16, 16, 12, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x32
+ INIT_WEIGHT_MASK(16, 32, 12, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_16x64
+ INIT_WEIGHT_MASK(16, 64, 12, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x8
+ INIT_WEIGHT_MASK(32, 8, 12, 2, 0);
#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x16
+ INIT_WEIGHT_MASK(32, 16, 12, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x32
+ INIT_WEIGHT_MASK(32, 32, 12, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_32x64
+ INIT_WEIGHT_MASK(32, 64, 12, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x16
+ INIT_WEIGHT_MASK(64, 16, 12, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x32
+ INIT_WEIGHT_MASK(64, 32, 12, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x64
+ INIT_WEIGHT_MASK(64, 64, 12, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_64x128
+ INIT_WEIGHT_MASK(64, 128, 12, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x64
+ INIT_WEIGHT_MASK(128, 64, 12, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp12bpp_WeightMask_128x128
+ INIT_WEIGHT_MASK(128, 128, 12, 4, 4);
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
} // namespace
@@ -222,6 +301,9 @@ void WeightMaskInit_C() {
#if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp();
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ Init12bpp();
+#endif
}
} // namespace dsp
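
INIT_WEIGHT_MASK is defined earlier in weight_mask.cc, outside this hunk. Its call sites pass (width, height, bitdepth, width_index, height_index), and the indices match log2(dimension) - 3 in every call above, which suggests the macro instantiates the C weight-mask function for a block size and stores it in a table indexed by log2 dimensions. A speculative, compilable reconstruction for orientation only; the real macro may differ, e.g. in how it wires the mask_is_inverse variant:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for the real WeightMask_C template.
template <int width, int height, int bitdepth, bool mask_is_inverse>
void WeightMask_C(const void* /*pred0*/, const void* /*pred1*/,
                  uint8_t* /*mask*/, ptrdiff_t /*mask_stride*/) {}

using WeightMaskFunc = void (*)(const void*, const void*, uint8_t*, ptrdiff_t);

// Hypothetical table: [log2(width) - 3][log2(height) - 3][mask_is_inverse].
WeightMaskFunc weight_mask[5][5][2];

#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
  weight_mask[w_index][h_index][0] =                                \
      WeightMask_C<width, height, bitdepth, false>;                 \
  weight_mask[w_index][h_index][1] =                                \
      WeightMask_C<width, height, bitdepth, true>

int main() {
  INIT_WEIGHT_MASK(8, 8, 12, 0, 0);
  return weight_mask[0][0][0] == nullptr;
}
```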
diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc
index 77b608e..74ec03c 100644
--- a/src/dsp/weight_mask_test.cc
+++ b/src/dsp/weight_mask_test.cc
@@ -54,42 +54,42 @@ constexpr int kCompoundPredictionRange[3][2] = {
const char* GetDigest8bpp(int id) {
static const char* const kDigest[] = {
- "035267cb2ac5a0f8ff50c2d30ad52226",
- "3231f4972dd858b734e0cc48c4cd001e",
- "7e163b69721a13ec9f75b5cd74ffee3f",
+ "eaca5b6a96dcfe5e44f3926a071b48b3",
+ "1d82c75cfdf8e57925eb1d5301647538",
+ "25bd455d74fb891b97b133c528f8db60",
"" /*kBlock4x16*/,
- "b75e90abc224acca8754c82039b3ba93",
- "9f555f3a2c1a933a663d6103b8118dea",
- "8539e54f34cd6668ff6e6606210be201",
- "20f85c9db7c878c21fbf2052936f269e",
- "620ec166de57b0639260b2d72eebfc3e",
- "be666394b5a894d78f4097b6cca272fe",
- "57a96816e84cdb381f596c23827b5922",
- "f2e0d348f608f246b6d8d799b66c189e",
- "161ac051f38372d9339d36728b9926ba",
- "d5fad48aaf132a81cb62bba4f07bbebb",
- "e10be2dca2f7dae38dae75150fc1612d",
- "7f744481eb551bbc224b5236c82cbade",
+ "1d82c75cfdf8e57925eb1d5301647538",
+ "25bd455d74fb891b97b133c528f8db60",
+ "62a08776db35a186406a11ab92dee71c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "25bd455d74fb891b97b133c528f8db60",
+ "62a08776db35a186406a11ab92dee71c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "0b3c75272e0fb0747b9850145d340c4c",
+ "95131d1dc0e05fcf4bd234d5ce9eea11",
+ "0b3c75272e0fb0747b9850145d340c4c",
+ "f26c43d4bc823a89c1ed47ab8708bc06",
+ "0d99bbf31ecddc1c2d5063a68c0e9375",
"0d99bbf31ecddc1c2d5063a68c0e9375",
"5fb8ec5f582f0ebfe519ed55860f67c4",
// mask_is_inverse = true.
- "a4250ca39daa700836138371d36d465f",
- "abe9a9a1c3a5accda9bfefd4d6e81ccb",
- "e95b08878d0bb5f2293c27c3a6fe0253",
+ "96811f3b192828ff679e4c9ad8069d7d",
+ "a04dc180c028d55af70240163445523a",
+ "8513e3988233d0a7de316a0179bb6139",
"" /*kBlock4x16*/,
- "e1c52be02ce9ab2800015bb08b866c31",
- "eea1dc73811f73866edfeb4555865f20",
- "3178e64085645bd819256a8ab43c7b0a",
- "ee83884e4d5cd2c9ac04879116bab681",
- "d107eff7d5ae9ba14d2c6b3b8d9fca49",
- "400aeea7d299626fc336c46b1ad7a9d8",
- "e9e26a400f67f3ad36350fe4171fc613",
- "4c31ad714f470f34127febaf1bac714b",
- "bbdcb1097c66d561dd4ea16b3fb73f97",
- "3a21dfbf53e4c964e303a75a3308ce15",
- "3416dab4512fd0dc61d788b433cd624e",
- "68ace8f01fdd74aec3fee528c8167738",
+ "a04dc180c028d55af70240163445523a",
+ "8513e3988233d0a7de316a0179bb6139",
+ "f7356d42fb44a6ccb41253ba35b8b3c7",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "8513e3988233d0a7de316a0179bb6139",
+ "f7356d42fb44a6ccb41253ba35b8b3c7",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "87a2011ac69fb597ca4f71bb3c35ebb0",
+ "3d2d61ffc203ee64fe91c9d16168a19d",
+ "87a2011ac69fb597ca4f71bb3c35ebb0",
+ "97100a3639d567046dc8a99fcb84cb2e",
+ "9fabe05a6523da81a45150e19f75acff",
"9fabe05a6523da81a45150e19f75acff",
"7c0643e4d02421d06d7ca71822a94e1d",
};
@@ -99,42 +99,42 @@ const char* GetDigest8bpp(int id) {
#if LIBGAV1_MAX_BITDEPTH >= 10
const char* GetDigest10bpp(int id) {
static const char* const kDigest[] = {
- "1dc9bdd042e5228705b857b42798e364",
- "c054c8644bd482ce78a139d8e063e013",
- "bbe4ac48f013f34c84779da05b0bcbe0",
+ "5ae8d64b65a671301a457b8a73368ab5",
+ "61535217f179054d4b76a8d9352a223d",
+ "1aa6614773570e7b021cd509849c4180",
"" /*kBlock4x16*/,
- "13d4759277637a607f25439182553708",
- "f089667610561a47d50f9f930ad7c454",
- "46715e6f7819f59725bdb083f4403255",
- "3774541c339ae3af920ef2b1d6abf6a1",
- "94913b01d226cb5eb273dfee84b51f65",
- "be0c0847629dfff8e0e991ed67697a7d",
- "716b5398b77d7459274d4ea9c91ebd8e",
- "f5c1b0b461df4182529949472242b421",
- "5e9576ea4cf107249ce4ae89a72b9c95",
- "da021bcdf7936f7bd9a2399c69e4d37c",
- "b3a310a39c1900e00f992839ff188656",
- "9f3a15351af5945615f296242ec56a38",
+ "61535217f179054d4b76a8d9352a223d",
+ "1aa6614773570e7b021cd509849c4180",
+ "f04c2825cfb6408c7778658f71fa176e",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "1aa6614773570e7b021cd509849c4180",
+ "f04c2825cfb6408c7778658f71fa176e",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "9c4855d44c013fbddb373b2e9e311080",
+ "e1694ea1f026dac7fe7e86a84482cf86",
+ "9c4855d44c013fbddb373b2e9e311080",
+ "f510e743c3efe3b83374a98ef8a30838",
+ "b6e0bd03c521c5f00e90530daa7d4432",
"b6e0bd03c521c5f00e90530daa7d4432",
"3270d7f621d488aec5b76bcf121debd0",
// mask_is_inverse = true.
- "33df96dd246683133eefe4caea6e3f7d",
- "73e0ccc5d42806548a4b59f856256c1e",
- "3561a0358cf831aee9477d07feafae2d",
+ "9aa00fcfe21b71e30c5393699122a020",
+ "4d8ce33262cf6b5375f363530815189a",
+ "428625c51ac1bd4585988f7b36dff1db",
"" /*kBlock4x16*/,
- "c5a2e633c0cd6925e68f21f47f0e2d84",
- "8755a2d3840dde5fd6a0cce6bd6642c5",
- "85ec538b72cecd6ea1fddab5ce3b4e64",
- "a53e0dec84c675c4c6b1f5792b0232ff",
- "86180da325f9727670a98cf2dbf7410e",
- "a5fdc95104948047e179b2bc3d47f51d",
- "9b95b3858187838e4669180e2ddb295e",
- "6e40ca55608f6bf2f8cd91c8dbf3ddbf",
- "d3a092672e921b588279d57e50b31888",
- "9883eb19b733ee9f1cb6a6b6a1a00bb5",
- "dd34764e068b228b7820321b06864e63",
- "6c743dc9c8c87c7044151d29993e5042",
+ "4d8ce33262cf6b5375f363530815189a",
+ "428625c51ac1bd4585988f7b36dff1db",
+ "1ef63c06a2d9c42da293fdf924032981",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "428625c51ac1bd4585988f7b36dff1db",
+ "1ef63c06a2d9c42da293fdf924032981",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "fe1e6843e6f214939da516dcbea04a79",
+ "5dd3f201d755d1c22c126a633bfbb3c0",
+ "fe1e6843e6f214939da516dcbea04a79",
+ "240187f27389b5e89f9ec6bdbd7d20a7",
+ "44925dab01011a98b8ab1f0308fa852a",
"44925dab01011a98b8ab1f0308fa852a",
"6d984b2ccfa056278e2130771127a943",
};
@@ -142,6 +142,52 @@ const char* GetDigest10bpp(int id) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigest12bpp(int id) {
+ static const char* const kDigest[] = {
+ "57629d3872fd52ff4bbec439c5517ec5",
+ "dba421ceeb534756c77167e00ae91a2c",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+ "" /*kBlock4x16*/,
+ "dba421ceeb534756c77167e00ae91a2c",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+ "ae573eb368df04e6a0133b4e15471728",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "72e8ac1d450ef0c6c6b03e93856d5cc2",
+ "ae573eb368df04e6a0133b4e15471728",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "c4976af803d7ad3f92ef26f25b9f3754",
+ "ceede597b2729357b15e0d08bb9bb760",
+ "c4976af803d7ad3f92ef26f25b9f3754",
+ "1d957d49f71bb7f304705a11a597f0cb",
+ "9522d5713fb951b79f42d78fbff914cf",
+ "9522d5713fb951b79f42d78fbff914cf",
+ "422c046013f79a9f46e2c855967570ba",
+
+ // mask_is_inverse = true.
+ "a585cca9bc459d10e081bc0eb847b6e3",
+ "2fa4ec5f74fad2831d216c51c2cdad5a",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+ "" /*kBlock4x16*/,
+ "2fa4ec5f74fad2831d216c51c2cdad5a",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+ "2ddd8c8a1841501964011030e2557e20",
+ "97ef2575023dda008711015cf08d7590",
+ "d6c9ac69a9eb3059f5bb6e42b486ebcd",
+ "2ddd8c8a1841501964011030e2557e20",
+ "97ef2575023dda008711015cf08d7590",
+ "d69aff1e0d43395ce305c9be0dfb4c89",
+ "97ef2575023dda008711015cf08d7590",
+ "d69aff1e0d43395ce305c9be0dfb4c89",
+ "48786f640191dcbee5b3321672778519",
+ "6ad4718230353440b01f2bb78348157e",
+ "6ad4718230353440b01f2bb78348157e",
+ "ad49bd7af0ea17c84f434c7dfd0a911d",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct WeightMaskTestParam {
WeightMaskTestParam(int width, int height, bool mask_is_inverse)
: width(width), height(height), mask_is_inverse(mask_is_inverse) {}
@@ -159,6 +205,7 @@ template <int bitdepth>
class WeightMaskTest : public testing::TestWithParam<WeightMaskTestParam>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
WeightMaskTest() = default;
~WeightMaskTest() override = default;
@@ -276,7 +323,7 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs,
SetInputData(use_fixed_values, value_1, value_2);
const absl::Time start = absl::Now();
for (int i = 0; i < num_runs; ++i) {
- func_(block_1_, block_2_, mask_, kMaxPredictionSize);
+ func_(block_1_, block_2_, mask_, width_);
}
const absl::Duration elapsed_time = absl::Now() - start;
if (use_fixed_values) {
@@ -284,8 +331,7 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs,
if (mask_is_inverse_) fixed_value = 64 - fixed_value;
for (int y = 0; y < height_; ++y) {
for (int x = 0; x < width_; ++x) {
- ASSERT_EQ(static_cast<int>(mask_[y * kMaxPredictionSize + x]),
- fixed_value)
+ ASSERT_EQ(static_cast<int>(mask_[y * width_ + x]), fixed_value)
<< "x: " << x << " y: " << y;
}
}
@@ -293,17 +339,26 @@ void WeightMaskTest<bitdepth>::Test(const int num_runs,
const int id_offset = mask_is_inverse_ ? kMaxBlockSizes - 4 : 0;
const int id = id_offset +
static_cast<int>(DimensionsToBlockSize(width_, height_)) - 4;
- if (bitdepth == 8) {
- test_utils::CheckMd5Digest(
- absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
- "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time);
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigest8bpp(id);
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- } else {
- test_utils::CheckMd5Digest(
- absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
- "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time);
+ case 10:
+ expected_digest = GetDigest10bpp(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigest12bpp(id);
+ break;
#endif
}
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(
+ absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(),
+ "WeightMask", expected_digest, mask_, sizeof(mask_), elapsed_time);
}
}
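The digest tables are indexed per block size, skipping the first four BlockSize enum values (both `- 4` adjustments above) and doubling up for the inverse-mask half. A minimal sketch of the lookup, with the helper name being hypothetical:

  const char* ExpectedDigest(int width, int height, bool mask_is_inverse) {
    // One digest per tested block size, per mask polarity.
    const int entries_per_polarity = kMaxBlockSizes - 4;
    const int id = (mask_is_inverse ? entries_per_polarity : 0) +
                   static_cast<int>(DimensionsToBlockSize(width, height)) - 4;
    return GetDigest12bpp(id);  // Or GetDigest8bpp()/GetDigest10bpp().
  }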
@@ -385,6 +440,28 @@ INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using WeightMaskTest12bpp = WeightMaskTest<12>;
+
+TEST_P(WeightMaskTest12bpp, FixedValues) {
+ const int min = kCompoundPredictionRange[2][0];
+ const int max = kCompoundPredictionRange[2][1];
+ Test(1, true, min, min);
+ Test(1, true, min, max);
+ Test(1, true, max, min);
+ Test(1, true, max, max);
+}
+
+TEST_P(WeightMaskTest12bpp, RandomValues) { Test(1, false, -1, -1); }
+
+TEST_P(WeightMaskTest12bpp, DISABLED_Speed) {
+ Test(kNumSpeedTests, false, -1, -1);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest12bpp,
+ testing::ValuesIn(weight_mask_test_param));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
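Unlike the 10bpp suite, which also registers an SSE4.1 instantiation above, the 12bpp suite currently has only the C instantiation. Assuming the usual one-binary-per-test layout (binary name hypothetical), the suite runs with standard googletest flags:

  // ./weight_mask_test --gtest_filter='C/WeightMaskTest12bpp.*'
  // The speed test is disabled by default and needs:
  // ./weight_mask_test --gtest_also_run_disabled_tests \
  //     --gtest_filter='C/WeightMaskTest12bpp.DISABLED_Speed/*'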
+
} // namespace
} // namespace dsp
} // namespace libgav1
diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc
index 911c5a9..c08b3d6 100644
--- a/src/dsp/x86/average_blend_sse4.cc
+++ b/src/dsp/x86/average_blend_sse4.cc
@@ -35,24 +35,46 @@ namespace {
constexpr int kInterPostRoundBit = 4;
-inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadLo8(prediction_0);
- const __m128i pred_1 = LoadLo8(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- Store4(dest, _mm_packus_epi16(res, res));
+inline void AverageBlend4x4Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ Store4(dest, result_pixels);
+ dest += dest_stride;
+ const int result_1 = _mm_extract_epi32(result_pixels, 1);
+ memcpy(dest, &result_1, sizeof(result_1));
+ dest += dest_stride;
+ const int result_2 = _mm_extract_epi32(result_pixels, 2);
+ memcpy(dest, &result_2, sizeof(result_2));
+ dest += dest_stride;
+ const int result_3 = _mm_extract_epi32(result_pixels, 3);
+ memcpy(dest, &result_3, sizeof(result_3));
}
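AverageBlend4x4Row now consumes a full 4x4 block per call (two aligned 16-byte loads per prediction) and writes each 4-byte row via extract plus memcpy, which is safe for any |dest| alignment. Per pixel it matches this scalar sketch, assuming kInterPostRoundBit == 4 as defined above:

  // Scalar model of one blended pixel; _mm_packus_epi16 supplies the clamp.
  uint8_t AverageBlendPixel(int16_t pred_0, int16_t pred_1) {
    const int shift = kInterPostRoundBit + 1;               // 5
    const int sum = pred_0 + pred_1 + ((1 << shift) >> 1);  // Rounding bias.
    const int value = sum >> shift;
    return static_cast<uint8_t>(value < 0 ? 0 : (value > 255 ? 255 : value));
  }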
inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0,
const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT dest) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- __m128i res = _mm_add_epi16(pred_0, pred_1);
- res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
- StoreLo8(dest, _mm_packus_epi16(res, res));
+ uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ __m128i res_0 = _mm_add_epi16(pred_00, pred_10);
+ res_0 = RightShiftWithRounding_S16(res_0, kInterPostRoundBit + 1);
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ __m128i res_1 = _mm_add_epi16(pred_01, pred_11);
+ res_1 = RightShiftWithRounding_S16(res_1, kInterPostRoundBit + 1);
+ const __m128i result_pixels = _mm_packus_epi16(res_0, res_1);
+ StoreLo8(dest, result_pixels);
+ StoreHi8(dest + dest_stride, result_pixels);
}
inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0,
@@ -85,35 +107,27 @@ void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
int y = height;
if (width == 4) {
+ const ptrdiff_t dest_stride4 = dest_stride << 2;
+ constexpr ptrdiff_t width4 = 4 << 2;
do {
- // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
- // to load 8 values at a time.
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend4Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend4x4Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride4;
+ pred_0 += width4;
+ pred_1 += width4;
- y -= 2;
+ y -= 4;
} while (y != 0);
return;
}
if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ constexpr ptrdiff_t width2 = 8 << 1;
do {
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
-
- AverageBlend8Row(pred_0, pred_1, dst);
- dst += dest_stride;
- pred_0 += width;
- pred_1 += width;
+ AverageBlend8Row(pred_0, pred_1, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
y -= 2;
} while (y != 0);
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
index 4ea811a..3288cfc 100644
--- a/src/dsp/x86/common_sse4_test.cc
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -31,7 +31,7 @@ namespace {
// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
// negative values.
-TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+TEST(CommonDspTest, SSE41RightShiftWithRoundingS16) {
for (int bits = 0; bits < 16; ++bits) {
const int bias = (1 << bits) >> 1;
for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
@@ -56,7 +56,7 @@ TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
#else // !LIBGAV1_TARGETING_SSE4_1
-TEST(CommonDspTest, SSE4) {
+TEST(CommonDspTest, SSE41) {
GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
"the tests.";
}
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
index 4126ca9..6e94347 100644
--- a/src/dsp/x86/convolve_avx2.cc
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -39,17 +39,17 @@ namespace {
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
// sum from exceeding the int16_t range.
-template <int filter_index>
+template <int num_taps>
__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
__m256i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm256_add_epi16(v_madd_21, v_madd_43);
sum = _mm256_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -58,7 +58,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -70,7 +70,7 @@ __m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
return sum;
}
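SumOnePassTaps forms the per-pair products with _mm256_maddubs_epi16, so it depends on the taps already being halved (the kHalfSubPixelFilters tables) for the running sum to stay inside int16_t. A scalar model, with |num_taps| playing the role of the template argument:

  // Scalar sketch: |taps| holds the half (pre-shifted by 1) filter values.
  template <int num_taps>
  int16_t SumTaps(const uint8_t* src, const int8_t* taps) {
    int sum = 0;
    for (int k = 0; k < num_taps; ++k) sum += src[k] * taps[k];
    return static_cast<int16_t>(sum);  // Fits because the taps are halved.
  }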
-template <int filter_index>
+template <int num_taps>
__m256i SumHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
__m256i v_src[4];
@@ -78,32 +78,32 @@ __m256i SumHorizontalTaps(const __m256i* const src,
const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index>
+template <int num_taps>
__m256i SimpleHorizontalTaps(const __m256i* const src,
const __m256i* const v_tap) {
- __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -116,17 +116,16 @@ __m256i SimpleHorizontalTaps(const __m256i* const src,
return _mm256_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m256i HorizontalTaps8To16(const __m256i* const src,
const __m256i* const v_tap) {
- const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
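HorizontalTaps8To16 keeps 16-bit intermediates and applies only the first-stage shift. RightShiftWithRounding_S16 adds half the divisor before shifting, the same bias the common_sse4 test above computes as (1 << bits) >> 1; a one-lane scalar model:

  int16_t RoundShift(int v, int bits) {
    return static_cast<int16_t>((v + ((1 << bits) >> 1)) >> bits);
  }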
// Filter 2xh sizes.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -145,14 +144,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -169,7 +168,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -194,8 +193,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
}
// Filter widths >= 4.
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -214,11 +212,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
LoadUnaligned16(&src[x + 24]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[x], result);
StoreAligned32(&dest16[x + 16], result2);
@@ -230,11 +228,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load src used to calculate dest8[7:0] and dest8[23:16].
const __m256i src_long = LoadUnaligned32(&src[x]);
const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
// Load src used to calculate dest8[15:8] and dest8[31:24].
const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
// Combine results and store.
StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
}
@@ -252,13 +250,12 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
const __m256i src_long2 =
SetrM128i(LoadUnaligned16(&src[src_stride]),
LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
if (is_2d) {
StoreAligned32(&dest16[0], result);
StoreAligned32(&dest16[pred_stride], result2);
@@ -270,12 +267,11 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
LoadUnaligned16(&src[src_stride]));
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
const __m256i src_long2 = SetrM128i(
LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
const __m256i result2 =
- SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
StoreUnaligned16(&dest8[pred_stride],
@@ -292,8 +288,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
if (is_2d) {
const __m256i src_long =
SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned32(&dest16[0], result);
}
@@ -306,8 +301,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
if (is_2d) {
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
StoreAligned16(&dest16[pred_stride],
@@ -322,8 +316,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -337,8 +330,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
}
@@ -351,8 +343,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
const __m256i src_long = SetrM128i(this_row, next_row);
if (is_2d || is_compound) {
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
} else {
@@ -360,8 +351,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const __m128i next_row = LoadUnaligned16(&src[src_stride]);
// Load into 2 128 bit lanes.
const __m256i src_long = SetrM128i(this_row, next_row);
- const __m256i result =
- SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
Store4(&dest8[0], _mm256_castsi256_si128(result));
Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
}
@@ -375,8 +365,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
// filter the remaining row.
if (is_2d) {
const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
- const __m256i result =
- HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
}
}
@@ -554,18 +543,15 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
const __m128i v_horizontal_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
- if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
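The folded condition works because, of the six valid filter indices, only 4 and 5 have bit 2 set, so the mask test is exactly the comparison spelled out in the comment:

  // filter_index: 0 1 2 3 4 5
  // index & 0x4 : 0 0 0 0 4 4   -> nonzero only for the 4-tap filters.
  static_assert((4 & 0x4) != 0 && (5 & 0x4) != 0, "");
  static_assert(
      (0 & 0x4) == 0 && (1 & 0x4) == 0 && (2 & 0x4) == 0 && (3 & 0x4) == 0,
      "");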
@@ -582,28 +568,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -617,7 +600,8 @@ void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
@@ -730,61 +714,60 @@ __m256i Compound1DShift(const __m256i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index, bool unpack_high = false>
+template <int num_taps, bool unpack_high = false>
__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
__m256i v_src[4];
if (!unpack_high) {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
}
} else {
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
}
}
- return SumOnePassTaps<filter_index>(v_src, v_tap);
+ return SumOnePassTaps<num_taps>(v_src, v_tap);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -821,9 +804,9 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadUnaligned32(src_x);
src_x += src_stride;
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -861,13 +844,12 @@ void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -922,9 +904,9 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m256i sums_hi =
- SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
if (is_compound) {
const __m256i results =
Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
@@ -964,13 +946,12 @@ void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m256i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1025,7 +1006,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row - 1] = _mm256_inserti128_si256(
srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
- const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m256i results = Compound1DShift(sums);
const __m128i this_dst = _mm256_castsi256_si128(results);
@@ -1062,13 +1043,12 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int /*width*/,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -1101,7 +1081,7 @@ void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -1137,7 +1117,8 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
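Backing |src| up by (vertical_taps / 2 - 1) rows centers the tap window on the output row. Worked out for the tap counts the helper can return:

  // taps   first row read   rows touched (relative to the output row)
  //  8         -3            -3 .. +4
  //  6         -2            -2 .. +3
  //  4         -1            -1 .. +2
  //  2          0             0 .. +1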
@@ -1151,43 +1132,43 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else if (width == 16) {
- FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
} else {
- FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
@@ -1199,67 +1180,38 @@ void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- } else {
- FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
- taps_256);
- }
}
} else { // width <= 8
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- } else {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
- taps);
- }
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
}
- } else {
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
- taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
}
}
}
@@ -1272,7 +1224,8 @@ void ConvolveCompoundVertical_AVX2(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -1286,43 +1239,43 @@ void ConvolveCompoundVertical_AVX2(
// Use 256 bits for width > 4.
if (width > 4) {
__m256i taps_256[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<0, /*is_compound=*/true>(
+ FilterVertical8xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<0, /*is_compound=*/true>(
+ FilterVertical16xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<0, /*is_compound=*/true>(
+ FilterVertical32xH<6, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<2, /*is_compound=*/true>(
+ FilterVertical8xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<2, /*is_compound=*/true>(
+ FilterVertical16xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<2, /*is_compound=*/true>(
+ FilterVertical32xH<8, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps_256);
if (width == 8) {
- FilterVertical8xH<3, /*is_compound=*/true>(
+ FilterVertical8xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else if (width == 16) {
- FilterVertical16xH<3, /*is_compound=*/true>(
+ FilterVertical16xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
} else {
- FilterVertical32xH<3, /*is_compound=*/true>(
+ FilterVertical32xH<2, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps_256);
if (width == 8) {
FilterVertical8xH<4, /*is_compound=*/true>(
@@ -1334,43 +1287,27 @@ void ConvolveCompoundVertical_AVX2(
FilterVertical32xH<4, /*is_compound=*/true>(
src, src_stride, dest, dest_stride, width, height, taps_256);
}
- } else {
- SetupTaps<4>(&v_filter, taps_256);
- if (width == 8) {
- FilterVertical8xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else if (width == 16) {
- FilterVertical16xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- } else {
- FilterVertical32xH<5, /*is_compound=*/true>(
- src, src_stride, dest, dest_stride, width, height, taps_256);
- }
}
} else { // width <= 4
// Use 128 bit code.
__m128i taps[4];
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 2) { // 8 tap.
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 3) { // 2 tap.
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
- } else {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
- dest_stride, height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
}
}
}
@@ -1430,7 +1367,8 @@ void ConvolveCompound2D_AVX2(
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(32) uint16_t
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index f7e5a71..f427c4c 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -36,7 +36,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
@@ -44,33 +44,33 @@ __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -83,16 +83,15 @@ __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
- const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int num_taps, int filter_index, bool is_2d = false,
- bool is_compound = false>
+template <int num_taps, bool is_2d = false, bool is_compound = false>
void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -108,16 +107,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int x = 0;
do {
if (is_2d || is_compound) {
- const __m128i v_sum =
- HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
if (is_2d) {
StoreAligned16(&dest16[x], v_sum);
} else {
StoreUnaligned16(&dest16[x], v_sum);
}
} else {
- const __m128i result =
- SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
x += 8;
@@ -138,10 +135,10 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
int y = height;
do {
if (is_2d || is_compound) {
- const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+ const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
StoreLo8(dest16, v_sum);
} else {
- const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+ const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
Store4(&dest8[0], result);
}
src += src_stride;
@@ -157,14 +154,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
do {
if (is_2d) {
const __m128i sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
Store4(&dest16[0], sum);
dest16 += pred_stride;
Store4(&dest16[0], _mm_srli_si128(sum, 8));
dest16 += pred_stride;
} else {
const __m128i sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
Store2(dest8, sum);
dest8 += pred_stride;
Store2(dest8, _mm_srli_si128(sum, 4));
@@ -181,7 +178,7 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
__m128i sum;
const __m128i input = LoadLo8(&src[2]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 ....
const __m128i v_src_43 =
_mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
@@ -218,28 +215,25 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
- SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
+ FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
- width, height, v_tap);
+ FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
@@ -253,7 +247,8 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
alignas(16) uint16_t
@@ -329,13 +324,12 @@ void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
}
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -373,7 +367,7 @@ void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16_x, results);
@@ -410,7 +404,8 @@ void ConvolveVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -422,63 +417,50 @@ void ConvolveVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
}
- } else {
- // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
- // See convolve_neon.cc
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 2) {
- FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else if (width == 4) {
- FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
- } else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
}
}
-void ConvolveCompoundCopy_SSE4(
+void ConvolveCompoundCopy_SSE4_1(
const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
@@ -502,7 +484,6 @@ void ConvolveCompoundCopy_SSE4(
_mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
const __m128i v_dest_hi =
_mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
- // TODO(slavarnway): Investigate using aligned stores.
StoreUnaligned16(&dest[x], v_dest_lo);
StoreUnaligned16(&dest[x + 8], v_dest_hi);
x += 16;
@@ -544,7 +525,8 @@ void ConvolveCompoundVertical_SSE4_1(
const int vertical_filter_id, const int width, const int height,
void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(filter_index, vertical_filter_id);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
@@ -555,55 +537,42 @@ void ConvolveCompoundVertical_SSE4_1(
const __m128i v_filter =
LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
- if (filter_index < 2) { // 6 tap.
+ if (vertical_taps == 6) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 2) { // 8 tap.
+ } else if (vertical_taps == 8) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 3) { // 2 tap.
+ } else if (vertical_taps == 2) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else if (filter_index == 4) { // 4 tap.
+ } else { // 4 tap.
SetupTaps<4>(&v_filter, taps);
-
if (width == 4) {
- FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
- } else {
- SetupTaps<4>(&v_filter, taps);
-
- if (width == 4) {
- FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
- } else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
- width, height, taps);
- }
}
}
@@ -656,7 +625,8 @@ void ConvolveCompound2D_SSE4_1(
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+ const int vertical_taps =
+ GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
const auto* const src = static_cast<const uint8_t*>(reference) -
@@ -933,7 +903,7 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
source);
StoreLo8(intermediate, RightShiftWithRounding_S16(
- SumOnePassTaps<filter_index>(source, taps),
+ SumOnePassTaps<num_taps>(source, taps),
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
@@ -960,10 +930,9 @@ inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
// Shift by one less because the taps are halved.
- StoreAligned16(
- intermediate_x,
- RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
- kInterRoundBitsHorizontal - 1));
+ StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
+ SumOnePassTaps<num_taps>(source, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
} while (--y != 0);
@@ -1188,7 +1157,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
alignas(16) int16_t
intermediate_result[kIntermediateAllocWidth *
(2 * kIntermediateAllocWidth + kSubPixelTaps)];
- const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+ const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
@@ -1211,7 +1180,7 @@ void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
// inputs in each iteration on large blocks. When step_x is large, we need a
// second register and alignr in order to gather all filter inputs.
// |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
- const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
const int kernel_start_ceiling = 16 - num_horiz_taps;
// This truncated quotient |grade_x_threshold| selects |step_x| such that:
// (step_x * 7) >> kScaleSubPixelBits < single load limit
@@ -1891,7 +1860,7 @@ void Init8bpp() {
dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
- dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+ dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
index 550d6a4..5548c5b 100644
--- a/src/dsp/x86/convolve_sse4.inc
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -18,20 +18,63 @@
#include "src/dsp/convolve.inc"
+// This version checks for the special cases when filter_index == 1.
+int GetNumTapsInFilter(const int filter_index, const int filter_id) {
+ if (filter_index == 0) {
+ // Despite the names, these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 1) {
+ // Despite the names these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) |
+ (filter_id == 8) | (filter_id == 9)) != 0) {
+ return 6;
+ }
+ // When |filter_index| == 1, the |filter_id| values not listed above map to
+ // 4 tap filters.
+ return 4;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with 4
+ // tap options.
+ // If the original filters were |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp| then it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+ // tap filter.
+ return 4;
+}
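A quick way to read the new mapping is as a table keyed on (filter_index, filter_id). A minimal standalone sketch (editorial, not part of the patch) that exercises only the filter_index == 1 case the new |filter_id| argument exists for:

#include <cassert>

void CheckFilterIndex1TapCounts() {
  // ids 1, 7, 8, 9 and 15 select the 6-tap variants; every other id in
  // [0, 16) maps to a 4-tap filter, matching the branch above.
  for (int filter_id = 0; filter_id < 16; ++filter_id) {
    const bool six_taps = filter_id == 1 || filter_id == 7 ||
                          filter_id == 8 || filter_id == 9 || filter_id == 15;
    assert(GetNumTapsInFilter(/*filter_index=*/1, filter_id) ==
           (six_taps ? 6 : 4));
  }
}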
+
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
sum from exceeding the range of int16_t.
-template <int filter_index>
+template <int num_taps>
__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
__m128i sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
sum = _mm_add_epi16(v_madd_21, v_madd_43);
sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
@@ -40,7 +83,7 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
} else {
@@ -52,13 +95,13 @@ __m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
return sum;
}
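The pre-shift remark above is worth a worked bound. A back-of-the-envelope check, treating 200 as an illustrative absolute-coefficient sum for a sharp filter (the real tables are in this neighborhood):

// Worst-case magnitude of a one-pass _mm_maddubs_epi16 tap sum: 8-bit pixels
// are at most 255 and each lane accumulates the pixels scaled by |taps|.
constexpr int MaxTapSum(int abs_coeff_sum) { return 255 * abs_coeff_sum; }
static_assert(MaxTapSum(200) > 32767, "unshifted taps could overflow int16_t");
static_assert(MaxTapSum(100) <= 32767, "halved taps stay within int16_t");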
-template <int filter_index>
+template <int num_taps>
__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
- if (filter_index == 3) {
+ if (num_taps == 2) {
// 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
const __m128i v_src_43 = _mm_shuffle_epi8(
v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
@@ -79,10 +122,10 @@ __m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return v_sum_5432;
}
-template <int filter_index>
+template <int num_taps>
__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
// Normally the Horizontal pass does the downshift in two passes:
// kInterRoundBitsHorizontal - 1 and then (kFilterBits -
@@ -95,11 +138,10 @@ __m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
return _mm_packus_epi16(sum, sum);
}
-template <int filter_index>
+template <int num_taps>
__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
@@ -411,36 +453,34 @@ __m128i Compound1DShift(const __m128i sum) {
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
+template <int num_taps>
__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
__m128i v_src[4];
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
+ } else {
// 4 taps.
v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
}
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
return sum;
}
-// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
-// 2D version.
-template <int num_taps, int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -468,7 +508,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 12 13 20 21 22 23
srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -515,7 +555,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 30 31 32 33 40 41 42 43
srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -574,7 +614,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 50 51 52 53 60 61 62 63
srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -645,7 +685,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
// 70 71 72 73 80 81 82 83
srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
StoreUnaligned16(dst16, results);
@@ -672,7 +712,7 @@ void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
}
}
-template <int num_taps, int filter_index, bool negative_outside_taps = false>
+template <int num_taps, bool negative_outside_taps = false>
void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int height, const __m128i* const v_tap) {
@@ -705,7 +745,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
// 10 11 20 21 30 31 40 41
srcs[1] = _mm_srli_si128(srcs_0_2, 2);
// This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -760,7 +800,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[3] = _mm_srli_si128(srcs_0_4, 6);
// This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -829,7 +869,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[5] = _mm_srli_si128(srcs_4_8, 2);
// This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
@@ -909,7 +949,7 @@ void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
srcs[7] = _mm_srli_si128(srcs_4_8, 6);
// This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
const __m128i results_16 =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
const __m128i results = _mm_packus_epi16(results_16, results_16);
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
index c813df4..8c32117 100644
--- a/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -34,54 +34,50 @@ namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
+constexpr int kInterPostRhsAdjust = 1 << (16 - kInterPostRoundBit - 1);
inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
const __m128i& pred1,
- const __m128i& weights) {
- // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
- const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
- const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
- const __m128i result_lo =
- RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
-
- const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
- const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
- const __m128i result_hi =
- RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
-
- return _mm_packs_epi32(result_lo, result_hi);
+ const __m128i& weight) {
+ // Given: p0,p1 in range [-5132,9212] and weights with w0 + w1 = 16.
+ // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+ // 8(=kInterPostRoundBit + 4)
+ // The formula is manipulated to avoid lengthening to 32 bits.
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ // = (p0 - p1) * w0 + 16 * p1
+ // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const __m128i diff = _mm_slli_epi16(_mm_sub_epi16(pred0, pred1), 1);
+ // ((p0 - p1) * (w0 << 12) >> 16) + ((16 * p1) >> 4)
+ const __m128i weighted_diff = _mm_mulhi_epi16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const __m128i upscaled_average = _mm_add_epi16(weighted_diff, pred1);
+ // (x << 11) >> 15 == x >> 4
+ const __m128i right_shift_prep = _mm_set1_epi16(kInterPostRhsAdjust);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return _mm_mulhrs_epi16(upscaled_average, right_shift_prep);
}
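Because the identity behind this rewrite is easy to get wrong, here is a scalar model of a single lane (an editorial sketch) that can be checked against the straightforward formula. It assumes only that the distance weights satisfy w0 + w1 == 16 with w0 in [1, 15], as the AV1 weight pairs do:

#include <cassert>

int WeightedAverageScalar(int p0, int p1, int w0) {
  const int diff = (p0 - p1) * 2;                       // sub + slli by 1
  const int weighted_diff = (diff * (w0 << 11)) >> 16;  // mulhi
  const int upscaled_average = weighted_diff + p1;      // add
  return (upscaled_average * 2048 + (1 << 14)) >> 15;   // mulhrs by 2048
}

void CheckWeightedAverage() {
  const int samples[] = {-5132, -1, 0, 1, 9212};
  for (int w0 = 1; w0 < 16; ++w0) {
    for (const int p0 : samples) {
      for (const int p1 : samples) {
        const int expected = (p0 * w0 + p1 * (16 - w0) + 128) >> 8;
        assert(WeightedAverageScalar(p0, p1, w0) == expected);
      }
    }
  }
}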
template <int height>
inline void DistanceWeightedBlend4xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 4) {
- // TODO(b/150326556): Use larger loads.
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
- const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights);
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights);
const __m128i result_pixels = _mm_packus_epi16(res0, res1);
Store4(dst, result_pixels);
@@ -101,11 +97,11 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
template <int height>
inline void DistanceWeightedBlend8xH_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest,
- const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
for (int y = 0; y < height; y += 2) {
const __m128i src_00 = LoadAligned16(pred_0);
@@ -130,11 +126,12 @@ inline void DistanceWeightedBlend8xH_SSE4_1(
inline void DistanceWeightedBlendLarge_SSE4_1(
const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height,
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight,
+ const int width, const int height, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+ // Upscale the weight for mulhi.
+ const __m128i weights = _mm_set1_epi16(weight << 11);
int y = height;
do {
@@ -162,23 +159,24 @@ inline void DistanceWeightedBlendLarge_SSE4_1(
void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1,
const uint8_t weight_0,
- const uint8_t weight_1, const int width,
+ const uint8_t /*weight_1*/, const int width,
const int height,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ const uint8_t weight = weight_0;
if (width == 4) {
if (height == 4) {
- DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else if (height == 8) {
- DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
} else {
assert(height == 16);
- DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
}
return;
}
@@ -186,28 +184,28 @@ void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
if (width == 8) {
switch (height) {
case 4:
- DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 8:
- DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
case 16:
- DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
default:
assert(height == 32);
- DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
- dest, dest_stride);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight, dest,
+ dest_stride);
return;
}
}
- DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
- height, dest, dest_stride);
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight, width, height, dest,
+ dest_stride);
}
void Init8bpp() {
@@ -273,27 +271,19 @@ inline void DistanceWeightedBlend4xH_SSE4_1(
int y = height;
do {
- const __m128i src_00 = LoadLo8(pred_0);
- const __m128i src_10 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- __m128i src_0 = LoadHi8(src_00, pred_0);
- __m128i src_1 = LoadHi8(src_10, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res0 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
-
- const __m128i src_01 = LoadLo8(pred_0);
- const __m128i src_11 = LoadLo8(pred_1);
- pred_0 += 4;
- pred_1 += 4;
- src_0 = LoadHi8(src_01, pred_0);
- src_1 = LoadHi8(src_11, pred_1);
- pred_0 += 4;
- pred_1 += 4;
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
const __m128i res1 =
- ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
StoreLo8(dst, res0);
dst += dest_stride;
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 9ece947..59d18a6 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -23,14 +23,15 @@
#include <cstdint>
#include <cstring>
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -165,7 +166,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_width; x += 8) {
+ for (; x + 8 <= safe_width; x += 8) {
const __m128i orig = LoadSource(&in_y_row[x]);
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -181,6 +182,7 @@ void BlendNoiseWithImageLuma_SSE4_1(
// Prevent arbitrary indices from entering GetScalingFactors.
memset(luma_buffer, 0, sizeof(luma_buffer));
const int valid_range = width - x;
+ assert(valid_range < 8);
memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
luma_buffer[valid_range] = in_y_row[width - 1];
const __m128i orig = LoadSource(&in_y_row[x]);
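The shape introduced here, tightening the main loop to full 8-pixel groups and staging the remainder through a zeroed buffer, is a general out-of-bounds guard. A minimal sketch, where Process8 is a hypothetical stand-in for the vector body and the pixel type is fixed to int16_t for brevity:

#include <cstdint>
#include <cstring>

void Process8(const int16_t* p);  // hypothetical vector body

void ProcessRowTailSafe(const int16_t* row, int width, int safe_width) {
  int x = 0;
  // Full groups only: the patched bound keeps all 8 lanes in range.
  for (; x + 8 <= safe_width; x += 8) Process8(&row[x]);
  if (x < width) {
    // Fewer than 8 pixels remain; stage them in a zeroed buffer so the
    // scaling lookup never reads uninitialized indices. (The patch also
    // replicates the last pixel at buffer[valid] for the neighbor access.)
    int16_t buffer[8];
    memset(buffer, 0, sizeof(buffer));
    const int valid = width - x;  // asserted < 8 in the patch
    memcpy(buffer, &row[x], valid * sizeof(row[0]));
    Process8(buffer);
  }
}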
@@ -239,7 +241,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
@@ -252,8 +254,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
}
- // This section only runs if width % (8 << sub_x) != 0. It should never run
- // on 720p and above.
if (x < chroma_width) {
// Prevent huge indices from entering GetScalingFactors due to
// uninitialized values. This is not a problem in 8bpp because the table
@@ -365,7 +365,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
int y = 0;
do {
int x = 0;
- for (; x < safe_chroma_width; x += 8) {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc
index e642aee..bc61745 100644
--- a/src/dsp/x86/intrapred_directional_sse4.cc
+++ b/src/dsp/x86/intrapred_directional_sse4.cc
@@ -624,14 +624,6 @@ inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
}
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
template <bool upsampled>
inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
@@ -729,6 +721,103 @@ inline void DirectionalZone1Blend_8xH(
}
}
+template <bool shuffle_left_column, bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const __m128i& xstep_for_shift, const __m128i& xstep_bounds_base,
+ const __m128i& left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop increments for moving by 8x8 blocks. This function also handles
+ // blocks of height 4, which are computed in a single pass, so in that case
+ // these variables go unused.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+
+ // Cover 8x4 case.
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, the blending y-loop below,
+ // covers blocks that mix values computed from the top row and the left
+ // column. The final stage covers blocks computed only from the left column.
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min(((x + 1) << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ if (max_top_only_y == height) return;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down, for this set of columns, need only
+ // |left_column| to compute.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ // Pick up from the last y-value; when the byte shuffle is not known to be
+ // safe, fall back on the roughly 10% slower but secure method for left
+ // prediction.
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ } else {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride, left_column + ((left_offset + y) << upsample_left_shift),
+ base_left_y, -ystep);
+ }
+}
+
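To make the stage boundaries concrete, a worked example with illustrative parameters (not drawn from a real bitstream):

// For x = 8, xstep = 64, height = 32, min_height = 8:
//   max_top_only_y  = min(((8 + 1) << 6) / 64, 32) & ~7      = 8
//   min_left_only_y = Align(min(((8 + 8) << 6) / 64, 32), 8) = 16
// Rows [0, 8) are computed purely from top_row, rows [8, 16) blend top and
// left, and rows [16, 32) are computed purely from left_column.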
// 7.11.2.4 (8) 90 < angle < 180
// The strategy for this function is to know how many blocks can be processed
// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
@@ -742,29 +831,11 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
const int width, const int height,
const int xstep, const int ystep) {
auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
+ // All columns from |min_top_only_x| to the right will only need |top_row|
+ // to compute. This assumes minimum |xstep| is 3.
const int min_top_only_x = std::min((height * xstep) >> 6, width);
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
// Accumulate xstep across 8 rows.
const __m128i xstep_dup = _mm_set1_epi16(-xstep);
const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
@@ -787,105 +858,39 @@ inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
// offset. Following values need the full ystep as a relative offset.
const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
__m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
left_y = _mm_add_epi16(ystep_init, left_y);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can be computed from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
int x = 0;
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
+ for (int left_offset = -left_base_increment; x < min_shuffle_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch left_y because it can still get big.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x;
x += 8,
xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
// Watch left_y because it can still get big.
left_y = _mm_add_epi16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
+ DirectionalZone2_8xH<true, upsampled_left, upsampled_top>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_for_shift, xstep_bounds_base, left_y);
}
for (; x < width; x += 4) {
DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
@@ -952,8 +957,8 @@ inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
left_offset -= left_base_increment4) {
uint8_t* dst_x = dst + x;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
max_top_only_y, -xstep, upsampled_top);
int y = max_top_only_y;
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
index 3363f0e..b4df072 100644
--- a/src/dsp/x86/loop_restoration_sse4.cc
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -2088,6 +2088,7 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
__m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-uninitialized with gcc.
s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
sq[0][0] = SquareLo8(s[0][0]);
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
index a18444b..833814c 100644
--- a/src/dsp/x86/mask_blend_sse4.cc
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -30,35 +30,81 @@
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
namespace {
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ if (subsampling_x == 1) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
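Per output element, the three branches above reduce to small scalar formulas; a reference model (editorial, names illustrative) for spot-checking the vector code:

#include <cstddef>
#include <cstdint>

uint16_t SubsampledMask(const uint8_t* m, ptrdiff_t stride, int x,
                        int subsampling_x, int subsampling_y) {
  if (subsampling_x == 1 && subsampling_y == 1) {  // 4:2:0: 2x2 average
    return (m[2 * x] + m[2 * x + 1] + m[2 * x + stride] +
            m[2 * x + 1 + stride] + 2) >> 2;
  }
  if (subsampling_x == 1) {  // 4:2:2: horizontal pair average
    return (m[2 * x] + m[2 * x + 1] + 1) >> 1;
  }
  return m[x];  // 4:4:4: widen only
}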
+
+// Imitate behavior of ARM vtrn1q_u64.
+inline __m128i Transpose1_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movelh_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
+
+// Imitate behavior of ARM vtrn2q_u64 (with the operand order reversed).
+inline __m128i Transpose2_U64(const __m128i a, const __m128i b) {
+ return _mm_castps_si128(
+ _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)));
+}
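In 64-bit lane terms, writing each vector as {lo, hi}, these helpers behave as follows; the operand order of the second one is why GetMask4x2 below passes (mask_val_23, mask_val_01) to collect rows 1 and 3:

// Transpose1_U64(a, b) == {a.lo, b.lo}   // same as vtrn1q_u64(a, b)
// Transpose2_U64(a, b) == {b.hi, a.hi}   // vtrn2q_u64(b, a)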
+
// Width can only be 4 when it is subsampled from a block of width 8, hence
// subsampling_x is always 1 when this function is called.
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_val_01 = LoadUnaligned16(mask);
+ // Stride is fixed because this is the smallest block size.
+ const __m128i mask_val_23 = LoadUnaligned16(mask + 16);
+ // Transpose rows to add row 0 to row 1, and row 2 to row 3.
+ const __m128i mask_val_02 = Transpose1_U64(mask_val_01, mask_val_23);
+ const __m128i mask_val_13 = Transpose2_U64(mask_val_23, mask_val_01);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_02, mask_val_13);
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRounding_U16(mask_0, 2);
+ }
+ return GetMask8<subsampling_x, 0>(mask, 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask4x2(const uint8_t* mask,
+ ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- if (subsampling_y == 1) {
- const __m128i next_mask_val_0 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+ return GetMask4x2<subsampling_x, subsampling_y>(mask);
}
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
+ assert(subsampling_y == 0 && subsampling_x == 0);
const __m128i mask_val_0 = Load4(mask);
const __m128i mask_val_1 = Load4(mask + mask_stride);
return _mm_cvtepu8_epi16(
_mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
}
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
// 16-bit is also the lowest packing for hadd, but without subsampling there is
// an unfortunate conversion required.
@@ -87,38 +133,6 @@ inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
return _mm_cvtepu8_epi16(mask_val);
}
-// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
-// when is_inter_intra is true, the prediction values are brought to 8-bit
-// packing as well.
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t stride) {
- if (subsampling_x == 1) {
- const __m128i row_vals = LoadUnaligned16(mask);
-
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
-
- if (subsampling_y == 1) {
- const __m128i next_row_vals = LoadUnaligned16(mask + stride);
- const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
- const __m128i next_mask_val_1 =
- _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
- subsampled_mask = _mm_add_epi16(
- subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
- }
- const __m128i ret =
- RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
- return _mm_packus_epi16(ret, ret);
- }
- assert(subsampling_y == 0 && subsampling_x == 0);
- // Unfortunately there is no shift operation for 8-bit packing, or else we
- // could return everything with 8-bit packing.
- const __m128i mask_val = LoadLo8(mask);
- return mask_val;
-}
-
inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
const int16_t* LIBGAV1_RESTRICT const pred_1,
const __m128i pred_mask_0,
@@ -149,15 +163,14 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT mask,
- const ptrdiff_t mask_stride,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4x4_SSE4_1(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT mask,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const __m128i mask_inverter = _mm_set1_epi16(64);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -166,30 +179,30 @@ inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
- const int16_t* LIBGAV1_RESTRICT pred_1,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int height,
- uint8_t* LIBGAV1_RESTRICT dst,
- const ptrdiff_t dst_stride) {
+inline void MaskBlending4xH_SSE4_1(
+ const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+ assert(subsampling_x == 1);
const uint8_t* mask = mask_ptr;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
if (height == 4) {
- MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ MaskBlending4x4_SSE4_1<subsampling_x, subsampling_y>(pred_0, pred_1, mask,
+ dst, dst_stride);
return;
}
const __m128i mask_inverter = _mm_set1_epi16(64);
int y = 0;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
@@ -199,7 +212,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -208,7 +221,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -217,7 +230,7 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
@@ -230,21 +243,21 @@ inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
}
template <int subsampling_x, int subsampling_y>
-inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- const ptrdiff_t /*prediction_stride_1*/,
- const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int width,
- const int height, void* LIBGAV1_RESTRICT dest,
- const ptrdiff_t dst_stride) {
+inline void MaskBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ const ptrdiff_t /*prediction_stride_1*/,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dst_stride) {
auto* dst = static_cast<uint8_t*>(dest);
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
const ptrdiff_t pred_stride_0 = width;
const ptrdiff_t pred_stride_1 = width;
if (width == 4) {
- MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ MaskBlending4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, mask_ptr, height, dst, dst_stride);
return;
}
const uint8_t* mask = mask_ptr;
@@ -293,7 +306,6 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
const __m128i pred_val_0 = LoadLo8(pred_0);
- // TODO(b/150326556): One load.
__m128i pred_val_1 = Load4(pred_1);
pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
pred_val_1);
@@ -309,16 +321,16 @@ inline void InterIntraWriteMaskBlendLine8bpp4x2(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4x4_SSE4(
+inline void InterIntraMaskBlending8bpp4x4_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
const ptrdiff_t mask_stride) {
const __m128i mask_inverter = _mm_set1_epi8(64);
const __m128i pred_mask_u16_first =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
const __m128i pred_mask_u16_second =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
mask += mask_stride << (1 + subsampling_y);
__m128i pred_mask_1 =
_mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
@@ -335,26 +347,26 @@ inline void InterIntraMaskBlending8bpp4x4_SSE4(
}
template <int subsampling_x, int subsampling_y>
-inline void InterIntraMaskBlending8bpp4xH_SSE4(
+inline void InterIntraMaskBlending8bpp4xH_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
const ptrdiff_t pred_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int height) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
return;
}
int y = 0;
do {
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
mask += mask_stride << (2 + subsampling_y);
- InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
pred_0, pred_1, pred_stride_1, mask, mask_stride);
pred_0 += 4 << 2;
pred_1 += pred_stride_1 << 2;
@@ -363,14 +375,31 @@ inline void InterIntraMaskBlending8bpp4xH_SSE4(
} while (y < height);
}
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8bpp8(const uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t stride) {
+ if (subsampling_x == 1) {
+ const __m128i ret = GetMask8<subsampling_x, subsampling_y>(mask, stride);
+ return _mm_packus_epi16(ret, ret);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ // Unfortunately there is no shift operation for 8-bit packing, or else we
+ // could return everything with 8-bit packing.
+ const __m128i mask_val = LoadLo8(mask);
+ return mask_val;
+}
+
template <int subsampling_x, int subsampling_y>
-void InterIntraMaskBlend8bpp_SSE4(
+void InterIntraMaskBlend8bpp_SSE4_1(
const uint8_t* LIBGAV1_RESTRICT prediction_0,
uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride,
const int width, const int height) {
if (width == 4) {
- InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+ InterIntraMaskBlending8bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
height);
return;
@@ -382,7 +411,7 @@ void InterIntraMaskBlend8bpp_SSE4(
int x = 0;
do {
const __m128i pred_mask_1 =
- GetInterIntraMask8<subsampling_x, subsampling_y>(
+ GetInterIntraMask8bpp8<subsampling_x, subsampling_y>(
mask + (x << subsampling_x), mask_stride);
// 64 - mask
const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
@@ -411,24 +440,24 @@ void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444)
- dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>;
+ dsp->mask_blend[0][0] = MaskBlend_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422)
- dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>;
+ dsp->mask_blend[1][0] = MaskBlend_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420)
- dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>;
+ dsp->mask_blend[2][0] = MaskBlend_SSE4_1<1, 1>;
#endif
// The is_inter_intra index of mask_blend[][] is replaced by
// inter_intra_mask_blend_8bpp[] in 8-bit.
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444)
- dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>;
+ dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4_1<0, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422)
- dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>;
+ dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4_1<1, 0>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420)
- dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>;
+ dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4_1<1, 1>;
#endif
}
@@ -443,14 +472,6 @@ constexpr int kMax10bppSample = (1 << 10) - 1;
constexpr int kMaskInverse = 64;
constexpr int kRoundBitsMaskBlend = 4;
-inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
- const __m128i zero) {
- // Shift out all but the last bit.
- const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
- // Avg with zero will shift by 1 and round.
- return _mm_avg_epu16(v_tmp_d, zero);
-}
-
inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
const __m128i shift) {
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
@@ -458,53 +479,31 @@ inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
}
template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
- const __m128i zero) {
- if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
- const __m128i mask_val_1 =
- _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 =
- LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
- const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
- mask + (mask_stride << 1) + mask_stride);
- const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+inline __m128i GetMask4x2(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_row_23 = LoadUnaligned16(mask + 16);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i mask_val_2 = _mm_cvtepu8_epi16(mask_row_23);
+ const __m128i mask_val_3 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_23, 8));
+ const __m128i subsampled_mask_02 = _mm_hadd_epi16(mask_val_0, mask_val_2);
+ const __m128i subsampled_mask_13 = _mm_hadd_epi16(mask_val_1, mask_val_3);
+ const __m128i subsampled_mask =
+ _mm_add_epi16(subsampled_mask_02, subsampled_mask_13);
+ return RightShiftWithRounding_U16(subsampled_mask, 2);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val_0 = Load4(mask);
- const __m128i mask_val_1 = Load4(mask + mask_stride);
- return _mm_cvtepu8_epi16(
- _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
-}
-
-template <int subsampling_x, int subsampling_y>
-inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
- const __m128i zero) {
if (subsampling_x == 1) {
- if (subsampling_y == 0) {
- const __m128i row_vals = LoadUnaligned16(mask);
- const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
- const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
- __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
- return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
- }
- const __m128i one = _mm_set1_epi8(1);
- const __m128i mask_val_0 = LoadUnaligned16(mask);
- const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
- const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
- const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
- return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+ const __m128i mask_row_01 = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(mask_row_01);
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(_mm_srli_si128(mask_row_01, 8));
+ const __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRounding_U16(subsampled_mask, 1);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const __m128i mask_val = LoadLo8(mask);
- return _mm_cvtepu8_epi16(mask_val);
+ return _mm_cvtepu8_epi16(LoadLo8(mask));
}
inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
@@ -558,12 +557,10 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -573,8 +570,7 @@ inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0,
mask += mask_stride << (1 + subsampling_y);
dst += dst_stride << 1;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
pred_mask_1, offset, max, shift4, dst,
@@ -595,7 +591,6 @@ inline void MaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
@@ -605,8 +600,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
int y = height;
do {
- __m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
@@ -617,8 +611,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -628,8 +621,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -639,8 +631,7 @@ inline void MaskBlend10bpp4xH_SSE4_1(
mask += mask_stride2;
dst += dst_stride2;
- pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, offset, max,
@@ -675,7 +666,6 @@ inline void MaskBlend10bpp_SSE4_1(
}
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
const __m128i offset = _mm_set1_epi32(kCompoundOffset);
const __m128i max = _mm_set1_epi16(kMax10bppSample);
@@ -685,7 +675,7 @@ inline void MaskBlend10bpp_SSE4_1(
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
@@ -729,7 +719,6 @@ inline void MaskBlend10bpp_SSE4_1(
mask += mask_stride_ss;
} while (--y != 0);
}
-
inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
@@ -764,9 +753,8 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -777,7 +765,7 @@ inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
dst += dst_stride << 1;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1, shift6,
@@ -798,7 +786,6 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
return;
}
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
- const __m128i zero = _mm_setzero_si128();
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
const uint8_t pred0_stride2 = 4 << 1;
const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
@@ -807,7 +794,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
int y = height;
do {
__m128i pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
__m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -818,7 +805,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -829,7 +816,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -840,7 +827,7 @@ inline void InterIntraMaskBlend10bpp4xH_SSE4_1(
dst += dst_stride2;
pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
pred_mask_0, pred_mask_1,
@@ -876,14 +863,13 @@ inline void InterIntraMaskBlend10bpp_SSE4_1(
const uint8_t* mask = mask_ptr;
const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
- const __m128i zero = _mm_setzero_si128();
const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
int y = height;
do {
int x = 0;
do {
const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride, zero);
+ mask + (x << subsampling_x), mask_stride);
const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
// 64 - mask
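
A note on the mask-blend hunks above: the refactor drops the `zero` register that RightShiftWithRoundingZero_U16 threaded through every call, switches to plain RightShiftWithRounding_U16, and splits the inter-intra 4x2 path into its own GetInterIntraMask4x2 helper. The arithmetic being vectorized is just a rounded average over the subsampled mask block; a minimal scalar sketch follows, with a hypothetical helper name and signature (not libgav1 API):

#include <cstddef>
#include <cstdint>

// Scalar equivalent of what GetMask4x2/GetMask8 vectorize: a rounded average
// over a 2x2 (both axes subsampled), 1x2 (x only), or 1x1 (no subsampling)
// block of 8-bit mask samples.
inline uint16_t SubsampledMask(const uint8_t* mask, std::ptrdiff_t stride,
                               int x, int subsampling_x, int subsampling_y) {
  if (subsampling_x == 1 && subsampling_y == 1) {
    const int sum = mask[2 * x] + mask[2 * x + 1] + mask[stride + 2 * x] +
                    mask[stride + 2 * x + 1];
    return static_cast<uint16_t>((sum + 2) >> 2);  // RightShiftWithRounding(sum, 2)
  }
  if (subsampling_x == 1) {
    const int sum = mask[2 * x] + mask[2 * x + 1];
    return static_cast<uint16_t>((sum + 1) >> 1);  // RightShiftWithRounding(sum, 1)
  }
  return mask[x];  // No subsampling: the mask value is widened as-is.
}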
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
index 8ce23b4..f068ff3 100644
--- a/src/dsp/x86/obmc_sse4.cc
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -39,8 +39,8 @@ namespace {
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 2;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -51,8 +51,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
- const __m128i obmc_pred_val =
- Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+ const __m128i obmc_pred_val = Load4(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
@@ -71,8 +70,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
@@ -85,15 +84,12 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
// Place the second row of each source in the second four bytes.
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -102,7 +98,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
const int second_row_result = _mm_extract_epi32(packed_result, 1);
memcpy(pred, &second_row_result, sizeof(second_row_result));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y -= 2;
} while (y != 0);
}
@@ -110,8 +106,8 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
inline void OverlapBlendFromLeft8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -121,16 +117,25 @@ inline void OverlapBlendFromLeft8xH_SSE4_1(
const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
int y = height;
do {
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
- StoreLo8(pred, _mm_packus_epi16(result, result));
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ y -= 2;
+ } while (y != 0);
}
void OverlapBlendFromLeft_SSE4_1(
@@ -144,18 +149,15 @@ void OverlapBlendFromLeft_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -192,8 +194,8 @@ void OverlapBlendFromLeft_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 4;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -212,13 +214,10 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
_mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
const __m128i pred_val0 = Load4(pred);
- const __m128i obmc_pred_val0 = Load4(obmc_pred);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
const __m128i pred_val =
_mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
- const __m128i obmc_pred_val = _mm_alignr_epi8(
- Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
const __m128i result =
RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
@@ -227,7 +226,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
Store4(pred - prediction_stride, packed_result);
Store4(pred, _mm_srli_si128(packed_result, 4));
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
y += 2;
} while (y < compute_height);
}
@@ -235,8 +234,8 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
inline void OverlapBlendFromTop8xH_SSE4_1(
uint8_t* LIBGAV1_RESTRICT const prediction,
const ptrdiff_t prediction_stride, const int height,
- const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_prediction_stride) {
+ const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_prediction_stride = 8;
uint8_t* pred = prediction;
const uint8_t* obmc_pred = obmc_prediction;
const uint8_t* mask = kObmcMask + height - 2;
@@ -244,20 +243,35 @@ inline void OverlapBlendFromTop8xH_SSE4_1(
const int compute_height = height - (height >> 2);
int y = compute_height;
do {
- const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+ const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
// 64 - mask
- const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
- const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
- const __m128i pred_val = LoadLo8(pred);
- const __m128i obmc_pred_val = LoadLo8(obmc_pred);
- const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
- const __m128i result =
- RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+ const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
+ const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
- StoreLo8(pred, _mm_packus_epi16(result, result));
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+
+ const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+ const __m128i result_lo =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
+
+ --y;
+ const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
+ // 64 - mask
+ const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
+ const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
+
+ const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+ const __m128i result_hi =
+ RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
+
+ const __m128i result = _mm_packus_epi16(result_lo, result_hi);
+ StoreLo8(pred, result);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (--y != 0);
+ StoreHi8(pred, result);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (--y > 0);
}
void OverlapBlendFromTop_SSE4_1(
@@ -271,13 +285,11 @@ void OverlapBlendFromTop_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
return;
}
@@ -333,8 +345,8 @@ constexpr int kRoundBitsObmcBlend = 6;
inline void OverlapBlendFromLeft2xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 2;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -348,8 +360,7 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = Load4x2(pred, pred + pred_stride);
- const __m128i obmc_pred_val =
- Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadLo8(obmc_pred);
const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i result = RightShiftWithRounding_U32(
_mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
@@ -364,8 +375,8 @@ inline void OverlapBlendFromLeft2xH_SSE4_1(
inline void OverlapBlendFromLeft4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const ptrdiff_t pred_stride2 = pred_stride << 1;
@@ -379,8 +390,7 @@ inline void OverlapBlendFromLeft4xH_SSE4_1(
int y = height;
do {
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -410,13 +420,11 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
const __m128i mask_inverter = _mm_set1_epi8(64);
@@ -452,8 +460,8 @@ void OverlapBlendFromLeft10bpp_SSE4_1(
inline void OverlapBlendFromTop4xH_SSE4_1(
uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
- const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
- const ptrdiff_t obmc_pred_stride) {
+ const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
+ constexpr int obmc_pred_stride = 4;
uint16_t* pred = prediction;
const uint16_t* obmc_pred = obmc_prediction;
const __m128i mask_inverter = _mm_set1_epi16(64);
@@ -473,8 +481,7 @@ inline void OverlapBlendFromTop4xH_SSE4_1(
const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
- const __m128i obmc_pred_val =
- LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
const __m128i result_lo = RightShiftWithRounding_U32(
@@ -505,8 +512,7 @@ void OverlapBlendFromTop10bpp_SSE4_1(
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
- obmc_pred_stride);
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
return;
}
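
The recurring move in the OBMC hunks above: the OBMC prediction buffer is tightly packed, so its stride always equals the block width and can become a constexpr instead of a function parameter. With the stride fixed, consecutive rows are contiguous in memory, and the 4xH/8xH kernels can fetch two rows at once and advance by stride << 1 per iteration. A sketch of the 8xH load pattern under that packing assumption (helper name illustrative):

#include <smmintrin.h>
#include <cstdint>

// With obmc_prediction_stride fixed at 8, rows y and y + 1 occupy bytes
// [0, 16) of the packed buffer, so a single unaligned 16-byte load replaces
// two 8-byte loads per iteration.
inline __m128i LoadTwoObmcRows8(const uint8_t* obmc_pred) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(obmc_pred));
}
// Per iteration: blend both rows, then advance obmc_pred += 8 * 2.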
diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc
index 5830894..5498052 100644
--- a/src/dsp/x86/warp_sse4.cc
+++ b/src/dsp/x86/warp_sse4.cc
@@ -167,7 +167,7 @@ inline void WriteVerticalFilter(const __m128i filter[8],
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+inline void VerticalFilter(const int16_t source[15][8], int64_t y4, int gamma,
int delta, DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -188,8 +188,8 @@ inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
}
template <bool is_compound, typename DestType>
-inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4,
- int gamma, int delta,
+inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols,
+ int64_t y4, int gamma, int delta,
DestType* LIBGAV1_RESTRICT dest_row,
ptrdiff_t dest_stride) {
int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
@@ -249,7 +249,7 @@ inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int source_width, int y4,
+ ptrdiff_t source_stride, int source_width, int64_t y4,
int ix4, int iy4, int gamma, int delta,
int16_t intermediate_result_column[15],
DestType* LIBGAV1_RESTRICT dst_row,
@@ -291,7 +291,7 @@ inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
ptrdiff_t source_stride, int source_height, int alpha,
- int beta, int x4, int ix4, int iy4,
+ int beta, int64_t x4, int ix4, int iy4,
int16_t intermediate_result[15][8]) {
// Region 3
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -323,8 +323,9 @@ inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src,
template <bool is_compound, typename DestType>
inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src,
- ptrdiff_t source_stride, int alpha, int beta, int x4,
- int ix4, int iy4, int16_t intermediate_result[15][8]) {
+ ptrdiff_t source_stride, int alpha, int beta,
+ int64_t x4, int ix4, int iy4,
+ int16_t intermediate_result[15][8]) {
// Region 4.
// At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
@@ -379,14 +380,8 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
int16_t intermediate_result_column[15];
};
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -439,33 +434,38 @@ inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if (filter_params.ix4 - 7 >= source_width - 1 || filter_params.ix4 + 7 <= 0) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame in both directions. One repeated value.
- WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
- source_height, ix4, iy4, dst_row,
- dest_stride);
+ WarpRegion1<is_compound, DestType>(
+ src, source_stride, source_width, source_height, filter_params.ix4,
+ filter_params.iy4, dst_row, dest_stride);
return;
}
// Outside the frame horizontally. Rows repeated.
WarpRegion2<is_compound, DestType>(
- src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
- intermediate_result_column, dst_row, dest_stride);
+ src, source_stride, source_width, filter_params.y4, filter_params.ix4,
+ filter_params.iy4, gamma, delta, intermediate_result_column, dst_row,
+ dest_stride);
return;
}
- if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
+ if ((filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0)) {
// Outside the frame vertically.
- WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
- beta, x4, ix4, iy4, intermediate_result);
+ WarpRegion3<is_compound, DestType>(
+ src, source_stride, source_height, alpha, beta, filter_params.x4,
+ filter_params.ix4, filter_params.iy4, intermediate_result);
} else {
// Inside the frame.
- WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
- iy4, intermediate_result);
+ WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta,
+ filter_params.x4, filter_params.ix4,
+ filter_params.iy4, intermediate_result);
}
// Region 3 and 4 vertical filter.
- VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
- dst_row, dest_stride);
+ VerticalFilter<is_compound, DestType>(intermediate_result, filter_params.y4,
+ gamma, delta, dst_row, dest_stride);
}
template <bool is_compound>
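
Two related changes run through the warp hunks: x4/y4 (and the parameters derived from them) widen from int to int64_t, and the shared setup moves into a GetWarpFilterParams helper returning a WarpFilterParams bundle. The widening is the substantive fix: the products in dst_x/dst_y can exceed 32 bits before the >> kWarpedModelPrecisionBits shift. A sketch mirroring the deleted inline computation (struct and field names follow the call sites above; the exact upstream signature is an assumption):

#include <cstdint>

struct WarpFilterParamsSketch {
  int64_t x4;
  int64_t y4;
  int ix4;  // The integer part after the precision shift fits in 32 bits.
  int iy4;
};

inline WarpFilterParamsSketch GetWarpFilterParamsSketch(
    int src_x, int src_y, int subsampling_x, int subsampling_y,
    const int32_t* warp_params, int precision_bits) {
  // Compute in 64 bits: source coordinates times warp model parameters can
  // overflow int32_t for large frames combined with extreme models.
  const int64_t dst_x = static_cast<int64_t>(src_x) * warp_params[2] +
                        static_cast<int64_t>(src_y) * warp_params[3] +
                        warp_params[0];
  const int64_t dst_y = static_cast<int64_t>(src_x) * warp_params[4] +
                        static_cast<int64_t>(src_y) * warp_params[5] +
                        warp_params[1];
  WarpFilterParamsSketch params;
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = static_cast<int>(params.x4 >> precision_bits);
  params.iy4 = static_cast<int>(params.y4 >> precision_bits);
  return params;
}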
diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc
index 69cb784..53a374d 100644
--- a/src/dsp/x86/weight_mask_sse4.cc
+++ b/src/dsp/x86/weight_mask_sse4.cc
@@ -37,10 +37,10 @@ namespace {
constexpr int kRoundingBits8bpp = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+inline void WeightMask16_SSE4_1(const int16_t* LIBGAV1_RESTRICT prediction_0,
+ const int16_t* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const __m128i pred_00 = LoadAligned16(prediction_0);
const __m128i pred_10 = LoadAligned16(prediction_1);
const __m128i difference_0 = RightShiftWithRounding_U16(
@@ -78,7 +78,7 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT8_PAIR_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE \
WEIGHT8_PAIR_WITHOUT_STRIDE; \
@@ -87,9 +87,10 @@ inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+void WeightMask8x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
@@ -100,10 +101,10 @@ void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 3;
@@ -116,10 +117,10 @@ void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 5;
@@ -132,7 +133,7 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
#define WEIGHT16_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT16_AND_STRIDE \
WEIGHT16_WITHOUT_STRIDE; \
@@ -141,10 +142,10 @@ void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y = 7;
@@ -155,10 +156,10 @@ void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -171,10 +172,10 @@ void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -190,10 +191,10 @@ void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -205,10 +206,11 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE;
}
-#define WEIGHT32_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE \
WEIGHT32_WITHOUT_STRIDE; \
@@ -217,10 +219,10 @@ void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
WEIGHT32_AND_STRIDE;
@@ -234,10 +236,10 @@ void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 5;
@@ -250,10 +252,10 @@ void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 6;
@@ -269,10 +271,10 @@ void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 21;
@@ -284,14 +286,15 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE;
}
-#define WEIGHT64_WITHOUT_STRIDE \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE \
WEIGHT64_WITHOUT_STRIDE; \
@@ -300,10 +303,10 @@ void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -316,10 +319,10 @@ void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y5 = 0;
@@ -335,10 +338,10 @@ void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -351,10 +354,10 @@ void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -368,10 +371,10 @@ void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -412,10 +415,10 @@ void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y3 = 0;
@@ -466,8 +469,9 @@ void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_SSE4<0>; \
- dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+ WeightMask##width##x##height##_SSE4_1<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_SSE4_1<1>
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -501,7 +505,7 @@ constexpr int kRoundingBits10bpp = 6;
constexpr int kScaledDiffShift = 4;
template <bool mask_is_inverse, bool is_store_16>
-inline void WeightMask16_10bpp_SSE4(
+inline void WeightMask16_10bpp_SSE4_1(
const uint16_t* LIBGAV1_RESTRICT prediction_0,
const uint16_t* LIBGAV1_RESTRICT prediction_1,
uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
@@ -562,9 +566,9 @@ inline void WeightMask16_10bpp_SSE4(
}
}
-#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
@@ -573,10 +577,10 @@ inline void WeightMask16_10bpp_SSE4(
mask += mask_stride << 1
template <bool mask_is_inverse>
-void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
@@ -587,10 +591,10 @@ void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 3;
@@ -603,10 +607,10 @@ void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask8x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 5;
@@ -618,9 +622,9 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT16_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride)
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
#define WEIGHT16_AND_STRIDE_10BPP \
WEIGHT16_WITHOUT_STRIDE_10BPP; \
@@ -629,10 +633,10 @@ void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y = 7;
@@ -643,10 +647,10 @@ void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -659,10 +663,10 @@ void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -678,10 +682,10 @@ void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask16x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -693,11 +697,11 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT16_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT32_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride)
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE_10BPP \
WEIGHT32_WITHOUT_STRIDE_10BPP; \
@@ -706,10 +710,10 @@ void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x8_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
WEIGHT32_AND_STRIDE_10BPP;
@@ -723,10 +727,10 @@ void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -739,10 +743,10 @@ void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -758,10 +762,10 @@ void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask32x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -773,15 +777,15 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
WEIGHT32_WITHOUT_STRIDE_10BPP;
}
-#define WEIGHT64_WITHOUT_STRIDE_10BPP \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
- mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
- mask + 16, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
- mask + 32, mask_stride); \
- WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
- mask + 48, mask_stride)
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4_1<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE_10BPP \
WEIGHT64_WITHOUT_STRIDE_10BPP; \
@@ -790,10 +794,10 @@ void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
mask += mask_stride
template <bool mask_is_inverse>
-void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x16_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 5;
@@ -806,10 +810,10 @@ void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x32_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y5 = 6;
@@ -825,10 +829,10 @@ void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -841,10 +845,10 @@ void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask64x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -858,10 +862,10 @@ void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x64_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 21;
@@ -902,10 +906,10 @@ void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
}
template <bool mask_is_inverse>
-void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
- const void* LIBGAV1_RESTRICT prediction_1,
- uint8_t* LIBGAV1_RESTRICT mask,
- ptrdiff_t mask_stride) {
+void WeightMask128x128_10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+ const void* LIBGAV1_RESTRICT prediction_1,
+ uint8_t* LIBGAV1_RESTRICT mask,
+ ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
int y3 = 42;
@@ -956,9 +960,9 @@ void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
dsp->weight_mask[w_index][h_index][0] = \
- WeightMask##width##x##height##_10bpp_SSE4<0>; \
+ WeightMask##width##x##height##_10bpp_SSE4_1<0>; \
dsp->weight_mask[w_index][h_index][1] = \
- WeightMask##width##x##height##_10bpp_SSE4<1>
+ WeightMask##width##x##height##_10bpp_SSE4_1<1>
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
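
The weight_mask changes are a mechanical rename from the _SSE4 suffix to _SSE4_1, matching the instruction-set naming used elsewhere in the tree. The one non-obvious ripple is that the INIT_WEIGHT_MASK_* macros build function names by token pasting, so they must be renamed in lockstep or the pasted identifiers no longer resolve. A reduced, self-contained model of the pattern (names hypothetical):

template <bool mask_is_inverse>
void WeightMask8x8_SSE4_1() {}

using WeightMaskFn = void (*)();
WeightMaskFn weight_mask[2];

#define INIT_WEIGHT_MASK(width, height)                      \
  weight_mask[0] = WeightMask##width##x##height##_SSE4_1<0>; \
  weight_mask[1] = WeightMask##width##x##height##_SSE4_1<1>

void Init() {
  INIT_WEIGHT_MASK(8, 8);  // Pastes to WeightMask8x8_SSE4_1<0> and <1>.
}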
diff --git a/src/film_grain.cc b/src/film_grain.cc
index 5c64ff2..44a2543 100644
--- a/src/film_grain.cc
+++ b/src/film_grain.cc
@@ -824,5 +824,8 @@ template class FilmGrain<kBitdepth8>;
#if LIBGAV1_MAX_BITDEPTH >= 10
template class FilmGrain<kBitdepth10>;
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+template class FilmGrain<kBitdepth12>;
+#endif
} // namespace libgav1
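
FilmGrain's member definitions live in film_grain.cc, so each supported bitdepth needs an explicit instantiation in that translation unit; this hunk adds the 12bpp one behind LIBGAV1_MAX_BITDEPTH == 12 (an exact match, unlike the >= 10 guard above it). A reduced model of the split, as a sketch:

// In the header: declaration only, members left undefined.
template <int bitdepth>
class FilmGrainModel {
 public:
  bool Init();
};

// In the .cc file: definitions plus one explicit instantiation per supported
// bitdepth. Without the matching instantiation, users of FilmGrainModel<12>
// would fail to link with undefined-symbol errors.
template <int bitdepth>
bool FilmGrainModel<bitdepth>::Init() {
  return bitdepth == 8 || bitdepth == 10 || bitdepth == 12;
}

template class FilmGrainModel<8>;
template class FilmGrainModel<10>;
template class FilmGrainModel<12>;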
diff --git a/src/film_grain.h b/src/film_grain.h
index f2c1e93..bda8458 100644
--- a/src/film_grain.h
+++ b/src/film_grain.h
@@ -104,7 +104,9 @@ class FilmGrain {
using Pixel =
typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
static constexpr int kScalingLutLength =
- (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8);
+ (bitdepth == 10)
+ ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2
+ : kScalingLookupTableSize + kScalingLookupTablePadding;
bool Init();
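
The header hunk stops deriving kScalingLutLength from bitdepth - 8; the old expression would have shifted by 4 at 12bpp (16x the base length), whereas the new one reserves the 4x expansion for 10bpp and lets 8bpp and 12bpp share the base length. A worked check of the new expression, using stand-in values for the upstream constants (the 256 and 6 figures are assumptions for illustration):

constexpr int kScalingLookupTableSize = 256;   // assumed stand-in value
constexpr int kScalingLookupTablePadding = 6;  // assumed stand-in value

template <int bitdepth>
constexpr int ScalingLutLength() {
  // Mirrors the new film_grain.h expression: only 10bpp is expanded 4x.
  return (bitdepth == 10)
             ? (kScalingLookupTableSize + kScalingLookupTablePadding) << 2
             : kScalingLookupTableSize + kScalingLookupTablePadding;
}

static_assert(ScalingLutLength<8>() == 262, "base length");
static_assert(ScalingLutLength<10>() == 1048, "4x the base length");
static_assert(ScalingLutLength<12>() == 262, "no longer 16x at 12bpp");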
diff --git a/src/film_grain_test.cc b/src/film_grain_test.cc
index bf37299..d5854e0 100644
--- a/src/film_grain_test.cc
+++ b/src/film_grain_test.cc
@@ -435,11 +435,25 @@ const char* GetTestDigestLuma(int bitdepth, int param_index) {
"0efbad5f9dc07391ad243232b8df1787", "2bd41882cd82960019aa2b87d5fb1fbc",
"1c66629c0c4e7b6f9b0a7a6944fbad50", "2c633a50ead62f8e844a409545f46244",
};
+ static const char* const kTestDigestsLuma12bpp[10] = {
+ "1dc9b38a93454a85eb924f25346ae369", "5f9d311ee5384a5a902f8e2d1297319e",
+ "cf1a35878720564c7a741f91eef66565", "47a0608fe0f6f7ccae42a5ca05783cbf",
+ "dbc28da0178e3c18a036c3f2203c300f", "04911d2074e3252119ee2d80426b8c01",
+ "df19ab8103c40b726c842ccf7772208b", "39276967eb16710d98f82068c3eeba41",
+ "b83100f18abb2062d9c9969f07182b86", "b39a69515491329698cf66f6d4fa371f",
+ };
- if (bitdepth == 8) {
- return kTestDigestsLuma8bpp[param_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsLuma10bpp[param_index];
}
const char* GetTestDigestChromaU(int bitdepth, int param_index) {
@@ -457,10 +471,25 @@ const char* GetTestDigestChromaU(int bitdepth, int param_index) {
"be306c6a94c55dbd9ef514f0ad4a0011", "904602329b0dec352b3b177b0a2554d2",
"58afc9497d968c67fdf2c0cf23b33aa3", "74fee7be6f62724bf901fdd04a733b46",
};
- if (bitdepth == 8) {
- return kTestDigestsChromaU8bpp[param_index];
+ static const char* const kTestDigestsChromaU12bpp[10] = {
+ "846d608050fe7c19d6cabe2d53cb7821", "2caf4665a26aad50f68497e4b1326417",
+ "ce40f0f8f8c207c7c985464c812fea33", "820de51d07a21da5c00833bab546f1fa",
+ "5e7bedd8933cd274af03babb4dbb94dd", "d137cf584eabea86387460a6d3f62bfe",
+ "f206e0c6ed35b3ab35c6ff37e151e963", "55d87981b7044df225b3b5935185449b",
+ "6a655c8bf4df6af0e80ae6d004a73a25", "6234ae36076cc77161af6e6e3c04449a",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaU8bpp[param_index];
+ case 10:
+ return kTestDigestsChromaU10bpp[param_index];
+ case 12:
+ return kTestDigestsChromaU12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsChromaU10bpp[param_index];
}
const char* GetTestDigestChromaV(int bitdepth, int param_index) {
@@ -478,95 +507,93 @@ const char* GetTestDigestChromaV(int bitdepth, int param_index) {
"7b1624c3543badf5fadaee4d1e602e6b", "3be074e4ca0eec5770748b15661aaadd",
"639197401032f272d6c30666a2d08f43", "28075dd34246bf9d5e6197b1944f646a",
};
- if (bitdepth == 8) {
- return kTestDigestsChromaV8bpp[param_index];
+ static const char* const kTestDigestsChromaV12bpp[10] = {
+ "4957ec919c20707d594fa5c2138c2550", "3f07c65bfb42c81768b1f5ad9611d1ce",
+ "665d9547171c99faba95ac81a35c9a0c", "1b5d032e0cefdb4041ad51796de8a45e",
+ "18fa974579a4f1ff8cd7df664fc339d5", "2ffaa4f143495ff73c06a580a97b6321",
+ "4fd1f562bc47a68dbfaf7c566c7c4da6", "4d37c80c9caf110c1d3d20bd1a1875b3",
+ "8ea29759640962613166dc5154837d14", "5ca4c10f42d0906c72ebee90fae6ce7d",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaV8bpp[param_index];
+ case 10:
+ return kTestDigestsChromaV10bpp[param_index];
+ case 12:
+ return kTestDigestsChromaV12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsChromaV10bpp[param_index];
}
const char* GetARTestDigestLuma(int bitdepth, int coeff_lag, int param_index) {
static const char* const kTestDigestsLuma8bpp[3][kNumFilmGrainTestParams] = {
- {
- "a835127918f93478b45f1ba4d20d81bd",
- "a835127918f93478b45f1ba4d20d81bd",
- "e5db4da626e214bb17bcc7ecffa76303",
- "a835127918f93478b45f1ba4d20d81bd",
- "a835127918f93478b45f1ba4d20d81bd",
- "e5db4da626e214bb17bcc7ecffa76303",
- "a835127918f93478b45f1ba4d20d81bd",
- "1da62b7233de502123a18546b6c97da2",
- "1da62b7233de502123a18546b6c97da2",
- "1da62b7233de502123a18546b6c97da2",
- },
- {
- "11464b880de3ecd6e6189c5c4e7f9b28",
- "dfe411762e283b5f49bece02ec200951",
- "5c534d92afdf0a5b53dbe4fe7271929c",
- "2e1a68a18aca96c31320ba7ceab59be9",
- "584c0323e6b276cb9acb1a294d462d58",
- "9571eb8f1cbaa96ea3bf64a820a8d9f0",
- "305285ff0df87aba3c59e3fc0818697d",
- "0066d35c8818cf20230114dcd3765a4d",
- "0066d35c8818cf20230114dcd3765a4d",
- "16d61b046084ef2636eedc5a737cb6f6",
- },
- {
- "0c9e2cf1b6c3cad0f7668026e8ea0516",
- "7d094855292d0eded9e0d1b5bab1990b",
- "fbf28860a5f1285dcc6725a45256a86a",
- "dccb906904160ccabbd2c9a7797a4bf9",
- "46f645e17f08a3260b1ae70284e5c5b8",
- "124fdc90bed11a7320a0cbdee8b94400",
- "8d2978651dddeaef6282191fa146f0a0",
- "28b4d5aa33f05b3fb7f9323a11936bdc",
- "6a8ea684f6736a069e3612d1af6391a8",
- "2781ea40a63704dbfeb3a1ac5db6f2fc",
- },
+ {"a835127918f93478b45f1ba4d20d81bd", "a835127918f93478b45f1ba4d20d81bd",
+ "e5db4da626e214bb17bcc7ecffa76303", "a835127918f93478b45f1ba4d20d81bd",
+ "a835127918f93478b45f1ba4d20d81bd", "e5db4da626e214bb17bcc7ecffa76303",
+ "a835127918f93478b45f1ba4d20d81bd", "1da62b7233de502123a18546b6c97da2",
+ "1da62b7233de502123a18546b6c97da2", "1da62b7233de502123a18546b6c97da2"},
+ {"11464b880de3ecd6e6189c5c4e7f9b28", "dfe411762e283b5f49bece02ec200951",
+ "5c534d92afdf0a5b53dbe4fe7271929c", "2e1a68a18aca96c31320ba7ceab59be9",
+ "584c0323e6b276cb9acb1a294d462d58", "9571eb8f1cbaa96ea3bf64a820a8d9f0",
+ "305285ff0df87aba3c59e3fc0818697d", "0066d35c8818cf20230114dcd3765a4d",
+ "0066d35c8818cf20230114dcd3765a4d", "16d61b046084ef2636eedc5a737cb6f6"},
+ {"0c9e2cf1b6c3cad0f7668026e8ea0516", "7d094855292d0eded9e0d1b5bab1990b",
+ "fbf28860a5f1285dcc6725a45256a86a", "dccb906904160ccabbd2c9a7797a4bf9",
+ "46f645e17f08a3260b1ae70284e5c5b8", "124fdc90bed11a7320a0cbdee8b94400",
+ "8d2978651dddeaef6282191fa146f0a0", "28b4d5aa33f05b3fb7f9323a11936bdc",
+ "6a8ea684f6736a069e3612d1af6391a8", "2781ea40a63704dbfeb3a1ac5db6f2fc"},
};
static const char* const kTestDigestsLuma10bpp[3][kNumFilmGrainTestParams] = {
- {
- "5e6bc8444ece2d38420f51d82238d812",
- "5e6bc8444ece2d38420f51d82238d812",
- "2bfaec768794af33d60a9771f971f68d",
- "5e6bc8444ece2d38420f51d82238d812",
- "5e6bc8444ece2d38420f51d82238d812",
- "c880807a368c4e82c23bea6f035ad23f",
- "5e6bc8444ece2d38420f51d82238d812",
- "c576667da5286183ec3aab9a76f53a2e",
- "c576667da5286183ec3aab9a76f53a2e",
- "c576667da5286183ec3aab9a76f53a2e",
- },
- {
- "095c2dd4d4d52aff9696df9bfdb70062",
- "983d14afa497060792d472a449a380c7",
- "c5fdc0f7c594b2b36132cec6f45a79bd",
- "acff232ac5597c1712213150552281d1",
- "4dd7341923b1d260092853553b6b6246",
- "0ca8afd71a4f564ea1ce69c4af14e9ab",
- "9bc7565e5359d09194fcee28e4bf7b94",
- "6fea7805458b9d149f238a30e2dc3f13",
- "6fea7805458b9d149f238a30e2dc3f13",
- "681dff5fc7a7244ba4e4a582ca7ecb14",
- },
- {
- "cb99352c9c6300e7e825188bb4adaee0",
- "7e40674de0209bd72f8e9c6e39ee6f7c",
- "3e475572f6b4ecbb2730fd16751ad7ed",
- "e6e4c63abc9cb112d9d1f23886cd1415",
- "1a1c953b175c105c604902877e2bab18",
- "380a53072530223d4ee622e014ee4bdb",
- "6137394ea1172fb7ea0cbac237ff1703",
- "85ab0c813e46f97cb9f42542f44c01ad",
- "68c8ac462f0e28cb35402c538bee32f1",
- "0038502ffa4760c8feb6f9abd4de7250",
- },
+ {"5e6bc8444ece2d38420f51d82238d812", "5e6bc8444ece2d38420f51d82238d812",
+ "2bfaec768794af33d60a9771f971f68d", "5e6bc8444ece2d38420f51d82238d812",
+ "5e6bc8444ece2d38420f51d82238d812", "c880807a368c4e82c23bea6f035ad23f",
+ "5e6bc8444ece2d38420f51d82238d812", "c576667da5286183ec3aab9a76f53a2e",
+ "c576667da5286183ec3aab9a76f53a2e", "c576667da5286183ec3aab9a76f53a2e"},
+ {"095c2dd4d4d52aff9696df9bfdb70062", "983d14afa497060792d472a449a380c7",
+ "c5fdc0f7c594b2b36132cec6f45a79bd", "acff232ac5597c1712213150552281d1",
+ "4dd7341923b1d260092853553b6b6246", "0ca8afd71a4f564ea1ce69c4af14e9ab",
+ "9bc7565e5359d09194fcee28e4bf7b94", "6fea7805458b9d149f238a30e2dc3f13",
+ "6fea7805458b9d149f238a30e2dc3f13", "681dff5fc7a7244ba4e4a582ca7ecb14"},
+ {"cb99352c9c6300e7e825188bb4adaee0", "7e40674de0209bd72f8e9c6e39ee6f7c",
+ "3e475572f6b4ecbb2730fd16751ad7ed", "e6e4c63abc9cb112d9d1f23886cd1415",
+ "1a1c953b175c105c604902877e2bab18", "380a53072530223d4ee622e014ee4bdb",
+ "6137394ea1172fb7ea0cbac237ff1703", "85ab0c813e46f97cb9f42542f44c01ad",
+ "68c8ac462f0e28cb35402c538bee32f1", "0038502ffa4760c8feb6f9abd4de7250"},
};
- if (bitdepth == 8) {
- return kTestDigestsLuma8bpp[coeff_lag - 1][param_index];
+ static const char* const kTestDigestsLuma12bpp[3][kNumFilmGrainTestParams] = {
+ {"d618bbb0e337969c91b1805f39561520", "d618bbb0e337969c91b1805f39561520",
+ "678f6e911591daf9eca4e305dabdb2b3", "d618bbb0e337969c91b1805f39561520",
+ "d618bbb0e337969c91b1805f39561520", "3b26f49612fd587c7360790d40adb5de",
+ "d618bbb0e337969c91b1805f39561520", "33f77d3ff50cfc64c6bc9a896b567377",
+ "33f77d3ff50cfc64c6bc9a896b567377", "33f77d3ff50cfc64c6bc9a896b567377"},
+ {"362fd67050fb7abaf57c43a92d993423", "e014ae0eb9e697281015c38905cc46ef",
+ "82b867e57151dc08afba31eccf5ccf69", "a94ba736cdce7bfa0b550285f59e47a9",
+ "3f1b0b7dd3b10e322254d35e4e185b7c", "7929708e5f017d58c53513cb79b35fda",
+ "6d26d31a091cbe642a7070933bd7de5a", "dc29ac40a994c0a760bfbad0bfc15b3a",
+ "dc29ac40a994c0a760bfbad0bfc15b3a", "399b919db5190a5311ce8d166580827b"},
+ {"6116d1f569f5b568eca4dc1fbf255086", "7e9cf31ea74e8ea99ffd12094ce6cd05",
+ "bb982c4c39e82a333d744defd16f4388", "7c6e584b082dc6b97ed0d967def3993f",
+ "fb234695353058f03c8e128f2f8de130", "9218c6ca67bf6a9237f98aa1ce7acdfd",
+ "d1fb834bbb388ed066c5cbc1c79b5bdf", "d6f630daedc08216fcea12012e7408b5",
+ "dd7fe49299e6f113a98debc7411c8db8", "8b89e45a5101a28c24209ae119eafeb8"},
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[coeff_lag - 1][param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[coeff_lag - 1][param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[coeff_lag - 1][param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsLuma10bpp[coeff_lag - 1][param_index];
}
const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag,
@@ -589,12 +616,28 @@ const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag,
"e2688d7286cd43fe0a3ea734d2ad0f77", "853193c4981bd882912171061327bdf2",
};
+ static const char* const kTestDigestsChromaU12bpp[12] = {
+ "04c23b01d01c0e3f3247f3741581b383", "9f8ea1d66e44f6fe93d765ce56b2b0f3",
+ "5dda44b128d6c244963f1e8e17cc1d22", "9dd0a79dd2f772310a95762d445bface",
+ "0dbd40d930e4873d72ea72b9e3d62440", "d7d83c207c6b435a164206d5f457931f",
+ "e8d04f6e63ed63838adff965275a1ff1", "fc09a903e941fcff8bad67a84f705775",
+ "9cd706606a2aa40d0957547756f7abd9", "258b37e7b8f48db77dac7ea24073fe69",
+ "80149b8bb05308da09c1383d8b79d3da", "e993f3bffae53204a1942feb1af42074",
+ };
+
assert(!(subsampling_x == 0 && subsampling_y == 1));
const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigestsChromaU8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaU8bpp[base_index];
+ case 10:
+ return kTestDigestsChromaU10bpp[base_index];
+ case 12:
+ return kTestDigestsChromaU12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsChromaU10bpp[base_index];
}
const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag,
@@ -617,12 +660,28 @@ const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag,
"d3d0912e3fdb956fef416a010bd7b4c2", "a2fca8abd9fd38d2eef3c4495d9eff78",
};
+ static const char* const kTestDigestsChromaV12bpp[12] = {
+ "0d1890335f4464167de22353678ca9c6", "9e6830aba73139407196f1c811f910bc",
+ "6018f2fb76bd648bef0262471cfeba5c", "78e1ae1b790d709cdb8997621cf0fde3",
+ "5b44ae281d7f9db2f17aa3c24b4741dd", "f931d16991669cb16721de87da9b8067",
+ "5580f2aed349d9cabdafb9fc25a57b1c", "86918cd78bf95e6d4405dd050f5890b8",
+ "13c8b314eeebe35fa60b703d94e1b2c1", "13c6fb75cab3f42e0d4ca31e4d068b0e",
+ "bb9ca0bd6f8cd67e44c8ac2803abf5a5", "0da4ea711ffe557bb66577392b6f148b",
+ };
+
assert(!(subsampling_x == 0 && subsampling_y == 1));
const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigestsChromaV8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsChromaV8bpp[base_index];
+ case 10:
+ return kTestDigestsChromaV10bpp[base_index];
+ case 12:
+ return kTestDigestsChromaV12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsChromaV10bpp[base_index];
}
const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) {
@@ -642,10 +701,25 @@ const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) {
"85a122e32648fde84b883a1f98947c60", "dee656e3791138285bc5b71e3491a177",
};
- if (bitdepth == 8) {
- return kTestDigestsLuma8bpp[param_index];
+ static const char* const kTestDigestsLuma12bpp[kNumFilmGrainTestParams] = {
+ "ae359794b5340d073d597117046886ac", "4d4ad3908b4fb0f248a0086537dd6b1e",
+ "672a97e15180cbeeaf76d763992c9f23", "739124d10d16e00a158e833ea92107bc",
+ "4c38c738ff7ffc50adaa4474584d3aae", "ca05ba7e51000a7d10e5cbb2101bbd86",
+ "e207022b916bf03a76ac8742af29853d", "7454bf1859149237ff74f1161156c857",
+ "10fc2a16e663bbc305255b0883cfcd45", "4228abff6899bb33839b579288ab29fe",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigestsLuma8bpp[param_index];
+ case 10:
+ return kTestDigestsLuma10bpp[param_index];
+ case 12:
+ return kTestDigestsLuma12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigestsLuma10bpp[param_index];
}
const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag,
@@ -663,11 +737,24 @@ const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag,
"125bf18b7787e8f0792ea12f9210de0d", "21cf98cbce17eca77dc150cc9be0e0a0",
};
+ static const char* const kTestDigests12bpp[6] = {
+ "57f8e17078b6e8935252e918a2562636", "556a7b294a99bf1163b7166b4f68357e",
+ "249bee5572cd7d1cc07182c97adc4ba7", "9bf43ae1998c2a5b2e5f4d8236b58747",
+ "477c08fa26499936e5bb03bde097633e", "fe64b7166ff87ea0711ae4f519cadd59",
+ };
+
const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigests8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigests10bpp[base_index];
}
const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag,
@@ -684,11 +771,24 @@ const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag,
"656a9ef056b04565bec9ca7e0873c408", "a70fff81ab28d02d99dd4f142699ba39",
};
+ static const char* const kTestDigests12bpp[6] = {
+ "146f7ceadaf77e7a3c41e191a58c1d3c", "de18526db39630936733e687cdca189e",
+ "165c96ff63bf3136505ab1d239f7ceae", "a102636662547f84e5f6fb6c3e4ef959",
+ "4cb073fcc783c158a95c0b1ce0d27e9f", "3a734c71d4325a7da53e2a6e00f81647",
+ };
+
const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigests8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigests10bpp[base_index];
}
const char* GetScalingInitTestDigest(int param_index, int bitdepth) {
@@ -708,23 +808,36 @@ const char* GetScalingInitTestDigest(int param_index, int bitdepth) {
"11b3e256c74cee2b5679f7457793869a", "89fab5c1db09e242d0494d1c696a774a",
};
- if (bitdepth == 8) {
- return kTestDigests8bpp[param_index];
+ static const char* const kTestDigests12bpp[kNumFilmGrainTestParams] = {
+ "1554df49a863a851d146213e09d311a4", "84808c3ed3b5495a62c9d2dd9a08cb26",
+ "bb31f083a3bd9ef26587478b8752f280", "34fdfe61d6871e4882e38062a0725c5c",
+ "bb31f083a3bd9ef26587478b8752f280", "e7b8c3e4508ceabe89b78f10a9e160b8",
+ "e7b8c3e4508ceabe89b78f10a9e160b8", "a0ccc9e3d0f0c9d1f08f1249264d92f5",
+ "7992a96883c8a9a35d6ca8961bc4515b", "de906ce2c0fceed6f168215447b21b16",
+ };
+
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[param_index];
+ case 10:
+ return kTestDigests10bpp[param_index];
+ case 12:
+ return kTestDigests12bpp[param_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- assert(bitdepth == 10);
- return kTestDigests10bpp[param_index];
}
const char* GetBlendLumaTestDigest(int bitdepth) {
- static const char* const kTestDigest8bpp = "de35b16c702690b1d311cdd0973835d7";
-
- static const char* const kTestDigest10bpp =
- "60e9f24dcaaa0207a8db5ab5f3c66608";
+ static const char* const kTestDigests[] = {
+ "de35b16c702690b1d311cdd0973835d7",
+ "60e9f24dcaaa0207a8db5ab5f3c66608",
+ "8e7d44b620bb7768459074be6bfbca7b",
+ };
- if (bitdepth == 8) {
- return kTestDigest8bpp;
- }
- return kTestDigest10bpp;
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return kTestDigests[(bitdepth - 8) / 2];
}
const char* GetBlendChromaUTestDigest(int bitdepth,
@@ -742,12 +855,25 @@ const char* GetBlendChromaUTestDigest(int bitdepth,
"9b7958a2278a16bce2b7bc31fdd811f5", "c5c3c8cccf6a2b4e40b4a412a5bf4f08",
};
+ static const char* const kTestDigests12bpp[6] = {
+ "8fad0cc641da35e0d2d8f178c7ce8394", "793eb9d2e6b4ea2e3bb08e7068236155",
+ "9156bd85ab9493d8867a174f920bb1e6", "6834319b4c88e3e0c96b6f8d7efd08dd",
+ "c40e492790d3803a734efbc6feca46e2", "d884c3b1e2c21d98844ca7639e0599a5",
+ };
+
const int base_index =
3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigests8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigests10bpp[base_index];
}
const char* GetBlendChromaVTestDigest(int bitdepth,
@@ -765,12 +891,25 @@ const char* GetBlendChromaVTestDigest(int bitdepth,
"ed4382caa936acf1158ff8049d18ffac", "942bdd1344c9182dd7572099fb9372db",
};
+ static const char* const kTestDigests12bpp[6] = {
+ "70704a1e171a3a70d40b7d0037a75fbc", "62549e2afbf36a1ed405a6574d39c542",
+ "e93889927ab77c6e0767ff071d980c02", "a0c1f6ed78874137710fee7418d80959",
+ "f6283e36a25cb867e30bdf0bfdb2124b", "741c2d48898835b9d9e3bd0b6ac6269a",
+ };
+
const int base_index =
3 * chroma_scaling_from_luma + subsampling_x + subsampling_y;
- if (bitdepth == 8) {
- return kTestDigests8bpp[base_index];
+ switch (bitdepth) {
+ case 8:
+ return kTestDigests8bpp[base_index];
+ case 10:
+ return kTestDigests10bpp[base_index];
+ case 12:
+ return kTestDigests12bpp[base_index];
+ default:
+ assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
+ return nullptr;
}
- return kTestDigests10bpp[base_index];
}
// GetFilmGrainRandomNumber() is only invoked with |bits| equal to 11 or 8. Test
@@ -844,6 +983,7 @@ template <int bitdepth>
class AutoRegressionTestLuma
: public testing::TestWithParam<std::tuple<int, int>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -982,6 +1122,28 @@ TEST_P(AutoRegressionTestLuma10bpp, DISABLED_Speed) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestLuma12bpp = AutoRegressionTestLuma<12>;
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLuma) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, AutoRegressiveFilterLumaSaturated) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma12bpp, DISABLED_Speed) {
+ TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ 1e5,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(
C, AutoRegressionTestLuma8bpp,
testing::Combine(testing::Range(1, 4) /* coeff_lag */,
@@ -1006,6 +1168,13 @@ INSTANTIATE_TEST_SUITE_P(
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(
+ C, AutoRegressionTestLuma12bpp,
+ testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+ testing::Range(0, 10) /* param_index */));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct AutoRegressionChromaTestParam {
explicit AutoRegressionChromaTestParam(const std::tuple<int, int>& in)
: coeff_lag(std::get<0>(in)) {
@@ -1033,6 +1202,7 @@ template <int bitdepth>
class AutoRegressionTestChroma
: public testing::TestWithParam<std::tuple<int, int>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -1228,9 +1398,37 @@ TEST_P(AutoRegressionTestChroma10bpp, DISABLED_Speed) {
1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
/*saturate=*/false, /*compare=*/false);
}
-
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using AutoRegressionTestChroma12bpp = AutoRegressionTestChroma<12>;
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChroma) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1,
+ /*saturate=*/false,
+ /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, AutoRegressiveFilterChromaSaturated) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+ test_param.subsampling_y, 1, /*saturate=*/true,
+ /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma12bpp, DISABLED_Speed) {
+ AutoRegressionChromaTestParam test_param(GetParam());
+ TestAutoRegressiveFilterChroma(
+ test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+ // Subsampling cuts each dimension of the chroma blocks in half, so run
+ // twice as many times to compensate.
+ 1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma8bpp,
testing::Combine(testing::Range(0, 4) /* coeff_lag */,
testing::Range(0,
@@ -1243,6 +1441,13 @@ INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma10bpp,
3) /* subsampling */));
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma12bpp,
+ testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+ testing::Range(0,
+ 3) /* subsampling */));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
#if LIBGAV1_ENABLE_NEON
INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma8bpp,
testing::Combine(testing::Range(0, 4) /* coeff_lag */,
@@ -1260,6 +1465,7 @@ INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma10bpp,
template <int bitdepth>
class GrainGenerationTest : public testing::TestWithParam<int> {
protected:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -1313,6 +1519,18 @@ TEST_P(GrainGenerationTest10bpp, DISABLED_LumaSpeed) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using GrainGenerationTest12bpp = GrainGenerationTest<12>;
+
+TEST_P(GrainGenerationTest12bpp, GenerateGrainLuma) {
+ TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest12bpp, DISABLED_LumaSpeed) {
+ TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp,
testing::Range(0, 10) /* param_index */);
@@ -1320,6 +1538,10 @@ INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp,
INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest10bpp,
testing::Range(0, 10) /* param_index */);
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest12bpp,
+ testing::Range(0, 10) /* param_index */);
+#endif // LIBGAV1_MAX_BITDEPTH == 12
// This param type is used for both ConstructStripesTest and
// ConstructImageTest.
@@ -1350,6 +1572,7 @@ template <int bitdepth>
class ConstructStripesTest
: public testing::TestWithParam<std::tuple<int, int>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -1523,6 +1746,30 @@ TEST_P(ConstructStripesTest10bpp, DISABLED_Speed) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructStripesTest12bpp = ConstructStripesTest<12>;
+
+TEST_P(ConstructStripesTest12bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+TEST_P(ConstructStripesTest12bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest12bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest8bpp,
testing::Combine(testing::Range(0, 2),
testing::Range(0, 3)));
@@ -1533,9 +1780,16 @@ INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest10bpp,
testing::Range(0, 3)));
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
template <int bitdepth>
class ConstructImageTest : public testing::TestWithParam<std::tuple<int, int>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -1732,6 +1986,31 @@ TEST_P(ConstructImageTest10bpp, DISABLED_Speed) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ConstructImageTest12bpp = ConstructImageTest<12>;
+
+TEST_P(ConstructImageTest12bpp, RandomValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructImageTest12bpp, SaturatedValues) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/1,
+ /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructImageTest12bpp, DISABLED_Speed) {
+ ConstructNoiseTestParam test_params(GetParam());
+ TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x,
+ test_params.subsampling_y, /*num_runs=*/500,
+ /*saturate=*/false, /*compare=*/false);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest8bpp,
testing::Combine(testing::Range(0, 2),
testing::Range(0, 3)));
@@ -1748,9 +2027,16 @@ INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest10bpp,
testing::Range(0, 3)));
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
template <int bitdepth>
class ScalingLookupTableTest : public testing::TestWithParam<int> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
ScalingLookupTableTest() {
test_utils::ResetDspTable(bitdepth);
FilmGrainInit_C();
@@ -1840,6 +2126,18 @@ TEST_P(ScalingLookupTableTest10bpp, DISABLED_Speed) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using ScalingLookupTableTest12bpp = ScalingLookupTableTest<12>;
+
+TEST_P(ScalingLookupTableTest12bpp, ZeroPoints) { ZeroPoints(); }
+
+TEST_P(ScalingLookupTableTest12bpp, Correctness) { TestSpeed(/*num_runs=*/1); }
+
+TEST_P(ScalingLookupTableTest12bpp, DISABLED_Speed) {
+ TestSpeed(/*num_runs=*/1e5);
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest8bpp,
testing::Range(0, kNumFilmGrainTestParams));
@@ -1858,6 +2156,11 @@ INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest12bpp,
+ testing::Range(0, kNumFilmGrainTestParams));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
struct BlendNoiseTestParam {
explicit BlendNoiseTestParam(const std::tuple<int, int>& in)
: chroma_scaling_from_luma(std::get<0>(in)) {
@@ -1884,6 +2187,7 @@ struct BlendNoiseTestParam {
template <int bitdepth, typename Pixel>
class BlendNoiseTest : public testing::TestWithParam<std::tuple<int, int>> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
using GrainType =
typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
@@ -2213,9 +2517,22 @@ INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest10bpp,
#endif
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using BlendNoiseTest12bpp = BlendNoiseTest<12, uint16_t>;
+
+TEST_P(BlendNoiseTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(BlendNoiseTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest12bpp,
+ testing::Combine(testing::Range(0, 2),
+ testing::Range(0, 3)));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
template <int bitdepth, typename Pixel>
class FilmGrainSpeedTest : public testing::TestWithParam<int> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
FilmGrainSpeedTest() {
test_utils::ResetDspTable(bitdepth);
FilmGrainInit_C();
@@ -2354,6 +2671,16 @@ INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest10bpp,
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using FilmGrainSpeedTest12bpp = FilmGrainSpeedTest<12, uint16_t>;
+
+TEST_P(FilmGrainSpeedTest12bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest12bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest12bpp, testing::Values(0, 3, 8));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
} // namespace film_grain
} // namespace dsp
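One detail from the test changes above: GetBlendLumaTestDigest replaces the per-bitdepth locals with a single array indexed by (bitdepth - 8) / 2, which maps the three legal bitdepths onto consecutive indices, with the assert rejecting anything else in debug builds. The mapping as a standalone sketch:

#include <cassert>

// Maps 8 -> 0, 10 -> 1, 12 -> 2; any other input is a caller bug.
inline int BitdepthToDigestIndex(int bitdepth) {
  assert(bitdepth == 8 || bitdepth == 10 || bitdepth == 12);
  return (bitdepth - 8) / 2;
}

static_assert((8 - 8) / 2 == 0 && (10 - 8) / 2 == 1 && (12 - 8) / 2 == 2, "");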
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
index 880c320..0a5586e 100644
--- a/src/gav1/decoder_buffer.h
+++ b/src/gav1/decoder_buffer.h
@@ -115,6 +115,27 @@ typedef enum Libgav1ColorRange {
kLibgav1ColorRangeFull // YUV/RGB [0..255]
} Libgav1ColorRange;
+typedef struct Libgav1ObuMetadataHdrCll { // NOLINT
+ uint16_t max_cll; // Maximum content light level.
+ uint16_t max_fall; // Maximum frame-average light level.
+} Libgav1ObuMetadataHdrCll;
+
+typedef struct Libgav1ObuMetadataHdrMdcv { // NOLINT
+ uint16_t primary_chromaticity_x[3];
+ uint16_t primary_chromaticity_y[3];
+ uint16_t white_point_chromaticity_x;
+ uint16_t white_point_chromaticity_y;
+ uint32_t luminance_max;
+ uint32_t luminance_min;
+} Libgav1ObuMetadataHdrMdcv;
+
+typedef struct Libgav1ObuMetadataItutT35 { // NOLINT
+ uint8_t country_code;
+ uint8_t country_code_extension_byte; // Valid if country_code is 0xFF.
+ uint8_t* payload_bytes;
+ int payload_size;
+} Libgav1ObuMetadataItutT35;
+
typedef struct Libgav1DecoderBuffer {
#if defined(__cplusplus)
LIBGAV1_PUBLIC int NumPlanes() const {
@@ -146,6 +167,18 @@ typedef struct Libgav1DecoderBuffer {
// Temporal id of this frame.
int temporal_id;
+ Libgav1ObuMetadataHdrCll hdr_cll;
+ int has_hdr_cll; // 1 if the values in hdr_cll are valid for this frame. 0
+ // otherwise.
+
+ Libgav1ObuMetadataHdrMdcv hdr_mdcv;
+ int has_hdr_mdcv; // 1 if the values in hdr_mdcv are valid for this frame. 0
+ // otherwise.
+
+ Libgav1ObuMetadataItutT35 itut_t35;
+ int has_itut_t35; // 1 if the values in itut_t35 are valid for this frame. 0
+ // otherwise.
+
// The |user_private_data| argument passed to Decoder::EnqueueFrame().
int64_t user_private_data;
// The |private_data| field of FrameBuffer. Set by the get frame buffer
@@ -264,6 +297,10 @@ using ColorRange = Libgav1ColorRange;
constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+using ObuMetadataHdrCll = Libgav1ObuMetadataHdrCll;
+using ObuMetadataHdrMdcv = Libgav1ObuMetadataHdrMdcv;
+using ObuMetadataItutT35 = Libgav1ObuMetadataItutT35;
+
using DecoderBuffer = Libgav1DecoderBuffer;
} // namespace libgav1
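The has_hdr_cll/has_hdr_mdcv/has_itut_t35 flags and metadata structs added to Libgav1DecoderBuffer above expose per-frame HDR and T.35 metadata to API users. A hedged usage sketch; the include path assumes installed headers, and the assumption that |buffer| was filled in by a successful decode is the caller's, not the header's:

#include <cstdio>

#include "gav1/decoder_buffer.h"

// Sketch: inspect the per-frame metadata fields added in this version.
void PrintFrameMetadata(const Libgav1DecoderBuffer* buffer) {
  if (buffer->has_hdr_cll != 0) {
    printf("HDR CLL: max_cll=%d max_fall=%d\n", buffer->hdr_cll.max_cll,
           buffer->hdr_cll.max_fall);
  }
  if (buffer->has_hdr_mdcv != 0) {
    printf("HDR MDCV: luminance [%u, %u]\n", buffer->hdr_mdcv.luminance_min,
           buffer->hdr_mdcv.luminance_max);
  }
  if (buffer->has_itut_t35 != 0) {
    printf("ITU-T T.35: country_code=0x%02x, %d payload bytes\n",
           static_cast<unsigned>(buffer->itut_t35.country_code),
           buffer->itut_t35.payload_size);
  }
}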
diff --git a/src/gav1/version.h b/src/gav1/version.h
index 9bdc630..b386acc 100644
--- a/src/gav1/version.h
+++ b/src/gav1/version.h
@@ -23,7 +23,7 @@
// (https://semver.org).
#define LIBGAV1_MAJOR_VERSION 0
-#define LIBGAV1_MINOR_VERSION 17
+#define LIBGAV1_MINOR_VERSION 18
#define LIBGAV1_PATCH_VERSION 0
#define LIBGAV1_VERSION \
diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake
index b97d09d..1314d0b 100644
--- a/src/libgav1_decoder.cmake
+++ b/src/libgav1_decoder.cmake
@@ -107,7 +107,7 @@ macro(libgav1_add_decoder_targets)
list(APPEND libgav1_static_lib_sources ${libgav1_api_sources})
endif()
- if(NOT ANDROID)
+ if(use_absl_threading)
list(APPEND libgav1_absl_deps absl::base absl::synchronization)
endif()
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
index 445450b..9e9166a 100644
--- a/src/obu_parser.cc
+++ b/src/obu_parser.cc
@@ -1767,11 +1767,7 @@ bool ObuParser::ParseFrameParameters() {
int64_t scratch;
if (sequence_header_.reduced_still_picture_header) {
frame_header_.show_frame = true;
- current_frame_ = buffer_pool_->GetFreeBuffer();
- if (current_frame_ == nullptr) {
- LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
- return false;
- }
+ if (!EnsureCurrentFrameIsNotNull()) return false;
} else {
OBU_READ_BIT_OR_FAIL;
frame_header_.show_existing_frame = scratch != 0;
@@ -1840,11 +1836,7 @@ bool ObuParser::ParseFrameParameters() {
}
return true;
}
- current_frame_ = buffer_pool_->GetFreeBuffer();
- if (current_frame_ == nullptr) {
- LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
- return false;
- }
+ if (!EnsureCurrentFrameIsNotNull()) return false;
OBU_READ_LITERAL_OR_FAIL(2);
frame_header_.frame_type = static_cast<FrameType>(scratch);
current_frame_->set_frame_type(frame_header_.frame_type);
@@ -2395,50 +2387,58 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
size -= metadata_type_size;
int64_t scratch;
switch (metadata_type) {
- case kMetadataTypeHdrContentLightLevel:
+ case kMetadataTypeHdrContentLightLevel: {
+ ObuMetadataHdrCll hdr_cll;
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.max_cll = scratch;
+ hdr_cll.max_cll = scratch;
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.max_fall = scratch;
+ hdr_cll.max_fall = scratch;
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ current_frame_->set_hdr_cll(hdr_cll);
break;
- case kMetadataTypeHdrMasteringDisplayColorVolume:
+ }
+ case kMetadataTypeHdrMasteringDisplayColorVolume: {
+ ObuMetadataHdrMdcv hdr_mdcv;
for (int i = 0; i < 3; ++i) {
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.primary_chromaticity_x[i] = scratch;
+ hdr_mdcv.primary_chromaticity_x[i] = scratch;
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.primary_chromaticity_y[i] = scratch;
+ hdr_mdcv.primary_chromaticity_y[i] = scratch;
}
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.white_point_chromaticity_x = scratch;
+ hdr_mdcv.white_point_chromaticity_x = scratch;
OBU_READ_LITERAL_OR_FAIL(16);
- metadata_.white_point_chromaticity_y = scratch;
+ hdr_mdcv.white_point_chromaticity_y = scratch;
OBU_READ_LITERAL_OR_FAIL(32);
- metadata_.luminance_max = static_cast<uint32_t>(scratch);
+ hdr_mdcv.luminance_max = static_cast<uint32_t>(scratch);
OBU_READ_LITERAL_OR_FAIL(32);
- metadata_.luminance_min = static_cast<uint32_t>(scratch);
+ hdr_mdcv.luminance_min = static_cast<uint32_t>(scratch);
+ if (!EnsureCurrentFrameIsNotNull()) return false;
+ current_frame_->set_hdr_mdcv(hdr_mdcv);
break;
+ }
case kMetadataTypeScalability:
if (!ParseMetadataScalability()) return false;
break;
case kMetadataTypeItutT35: {
+ ObuMetadataItutT35 itut_t35;
OBU_READ_LITERAL_OR_FAIL(8);
- metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch);
+ itut_t35.country_code = static_cast<uint8_t>(scratch);
++data;
--size;
- if (metadata_.itu_t_t35_country_code == 0xFF) {
+ if (itut_t35.country_code == 0xFF) {
OBU_READ_LITERAL_OR_FAIL(8);
- metadata_.itu_t_t35_country_code_extension_byte =
- static_cast<uint8_t>(scratch);
+ itut_t35.country_code_extension_byte = static_cast<uint8_t>(scratch);
++data;
--size;
}
- // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says:
- // itu_t_t35_payload_bytes shall be bytes containing data registered as
+ // Read itut_t35.payload_bytes. Section 6.7.2 of the spec says:
+ // itut_t35.payload_bytes shall be bytes containing data registered as
// specified in Recommendation ITU-T T.35.
- // Therefore itu_t_t35_payload_bytes is byte aligned and the first
- // trailing byte should be 0x80. Since the exact syntax of
- // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the
- // end of itu_t_t35_payload_bytes by searching for the trailing bit.
+ // Therefore itut_t35.payload_bytes is byte aligned and the first trailing
+ // byte should be 0x80. Since the exact syntax of itut_t35.payload_bytes
+ // is not defined in the AV1 spec, identify the end of
+ // itut_t35.payload_bytes by searching for the trailing bit.
const int i = GetLastNonzeroByteIndex(data, size);
if (i < 0) {
LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
@@ -2447,20 +2447,15 @@ bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
if (data[i] != 0x80) {
LIBGAV1_DLOG(
ERROR,
- "itu_t_t35_payload_bytes is not byte aligned. The last nonzero "
- "byte of the payload data is 0x%x, should be 0x80.",
+ "itut_t35.payload_bytes is not byte aligned. The last nonzero byte "
+ "of the payload data is 0x%x, should be 0x80.",
data[i]);
return false;
}
- if (i != 0) {
- // data[0]..data[i - 1] are itu_t_t35_payload_bytes.
- metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]);
- if (metadata_.itu_t_t35_payload_bytes == nullptr) {
- LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed.");
- return false;
- }
- memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i);
- metadata_.itu_t_t35_payload_size = i;
+ itut_t35.payload_size = i;
+ if (!EnsureCurrentFrameIsNotNull() ||
+ !current_frame_->set_itut_t35(itut_t35, data)) {
+ return false;
}
// Skip all bits before the trailing bit.
bit_reader_->SkipBytes(i);
@@ -2637,6 +2632,16 @@ bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
return bit_reader_ != nullptr;
}
+bool ObuParser::EnsureCurrentFrameIsNotNull() {
+ if (current_frame_ != nullptr) return true;
+ current_frame_ = buffer_pool_->GetFreeBuffer();
+ if (current_frame_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+ return false;
+ }
+ return true;
+}
+
bool ObuParser::HasData() const { return size_ > 0; }
StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
@@ -2652,7 +2657,6 @@ StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
// Clear everything except the sequence header.
obu_headers_.clear();
frame_header_ = {};
- metadata_ = {};
tile_buffers_.clear();
next_tile_group_start_ = 0;
sequence_header_changed_ = false;
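EnsureCurrentFrameIsNotNull() folds the two duplicated get-buffer-or-fail blocks into one helper, and the metadata cases now call it as well, since a metadata OBU can arrive before ParseFrameParameters() has acquired a frame. The get-or-allocate guard in generic form, using stand-in types rather than libgav1's classes:

#include <memory>

struct FrameSketch {};
struct PoolSketch {
  // Returns nullptr when no buffer is available.
  std::shared_ptr<FrameSketch> GetFreeBuffer() {
    return std::make_shared<FrameSketch>();
  }
};

class ParserSketch {
 public:
  explicit ParserSketch(PoolSketch* pool) : pool_(pool) {}

  // Acquires a frame lazily; a no-op if one is already held.
  bool EnsureCurrentFrameIsNotNull() {
    if (current_frame_ != nullptr) return true;
    current_frame_ = pool_->GetFreeBuffer();
    return current_frame_ != nullptr;
  }

 private:
  PoolSketch* pool_;
  std::shared_ptr<FrameSketch> current_frame_;
};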
diff --git a/src/obu_parser.h b/src/obu_parser.h
index 3f452ef..eba3370 100644
--- a/src/obu_parser.h
+++ b/src/obu_parser.h
@@ -221,26 +221,6 @@ enum MetadataType : uint8_t {
// 32 and greater are reserved for AOM use.
};
-struct ObuMetadata {
- // Maximum content light level.
- uint16_t max_cll;
- // Maximum frame-average light level.
- uint16_t max_fall;
- uint16_t primary_chromaticity_x[3];
- uint16_t primary_chromaticity_y[3];
- uint16_t white_point_chromaticity_x;
- uint16_t white_point_chromaticity_y;
- uint32_t luminance_max;
- uint32_t luminance_min;
- // ITU-T T.35.
- uint8_t itu_t_t35_country_code;
- uint8_t itu_t_t35_country_code_extension_byte; // Valid if
- // itu_t_t35_country_code is
- // 0xFF.
- std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
- size_t itu_t_t35_payload_size;
-};
-
class ObuParser : public Allocable {
public:
ObuParser(const uint8_t* const data, size_t size, int operating_point,
@@ -276,7 +256,6 @@ class ObuParser : public Allocable {
const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
const ObuFrameHeader& frame_header() const { return frame_header_; }
const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
- const ObuMetadata& metadata() const { return metadata_; }
// Returns true if the last call to ParseOneFrame() encountered a sequence
// header change.
bool sequence_header_changed() const { return sequence_header_changed_; }
@@ -372,6 +351,11 @@ class ObuParser : public Allocable {
size_t tg_header_size, size_t bytes_consumed_so_far);
bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1.
+ // Populates |current_frame_| from the |buffer_pool_| if |current_frame_| is
+ // nullptr. Does not do anything otherwise. Returns true on success, false
+ // otherwise.
+ bool EnsureCurrentFrameIsNotNull();
+
// Parser elements.
std::unique_ptr<RawBitReader> bit_reader_;
const uint8_t* data_;
@@ -383,7 +367,6 @@ class ObuParser : public Allocable {
ObuSequenceHeader sequence_header_ = {};
ObuFrameHeader frame_header_ = {};
Vector<TileBuffer> tile_buffers_;
- ObuMetadata metadata_ = {};
// The expected starting tile number of the next Tile Group.
int next_tile_group_start_ = 0;
// If true, the sequence_header_ field is valid.
diff --git a/src/obu_parser_test.cc b/src/obu_parser_test.cc
index 6397ad0..a471037 100644
--- a/src/obu_parser_test.cc
+++ b/src/obu_parser_test.cc
@@ -31,6 +31,7 @@
#include "src/gav1/status_code.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
#include "src/utils/segmentation.h"
#include "src/utils/types.h"
#include "src/utils/vector.h"
@@ -780,39 +781,38 @@ class ObuParserTest : public testing::Test {
OBU_TEST_COMPARE(film_grain_params_present);
}
- void VerifyMetadata(MetadataType type, const ObuMetadata& expected) {
- const ObuMetadata& actual = obu_->metadata();
- switch (type) {
- case kMetadataTypeHdrContentLightLevel:
- OBU_TEST_COMPARE(max_cll);
- OBU_TEST_COMPARE(max_fall);
- break;
- case kMetadataTypeHdrMasteringDisplayColorVolume:
- for (int i = 0; i < 3; ++i) {
- OBU_TEST_COMPARE(primary_chromaticity_x[i]);
- OBU_TEST_COMPARE(primary_chromaticity_y[i]);
- }
- OBU_TEST_COMPARE(white_point_chromaticity_x);
- OBU_TEST_COMPARE(white_point_chromaticity_y);
- OBU_TEST_COMPARE(luminance_max);
- OBU_TEST_COMPARE(luminance_min);
- break;
- case kMetadataTypeScalability:
- break;
- case kMetadataTypeItutT35:
- OBU_TEST_COMPARE(itu_t_t35_country_code);
- OBU_TEST_COMPARE(itu_t_t35_country_code_extension_byte);
- ASSERT_EQ(expected.itu_t_t35_payload_size,
- actual.itu_t_t35_payload_size);
- if (actual.itu_t_t35_payload_size != 0) {
- EXPECT_EQ(memcmp(expected.itu_t_t35_payload_bytes.get(),
- actual.itu_t_t35_payload_bytes.get(),
- actual.itu_t_t35_payload_size),
- 0);
- }
- break;
- case kMetadataTypeTimecode:
- break;
+ void VerifyMetadataHdrCll(const ObuMetadataHdrCll& expected) {
+ EXPECT_TRUE(obu_->current_frame_->hdr_cll_set());
+ const ObuMetadataHdrCll& actual = obu_->current_frame_->hdr_cll();
+ OBU_TEST_COMPARE(max_cll);
+ OBU_TEST_COMPARE(max_fall);
+ }
+
+ void VerifyMetadataHdrMdcv(const ObuMetadataHdrMdcv& expected) {
+ EXPECT_TRUE(obu_->current_frame_->hdr_mdcv_set());
+ const ObuMetadataHdrMdcv& actual = obu_->current_frame_->hdr_mdcv();
+ for (int i = 0; i < 3; ++i) {
+ OBU_TEST_COMPARE(primary_chromaticity_x[i]);
+ OBU_TEST_COMPARE(primary_chromaticity_y[i]);
+ }
+ OBU_TEST_COMPARE(white_point_chromaticity_x);
+ OBU_TEST_COMPARE(white_point_chromaticity_y);
+ OBU_TEST_COMPARE(luminance_max);
+ OBU_TEST_COMPARE(luminance_min);
+ }
+
+ void VerifyMetadataItutT35(const ObuMetadataItutT35& expected) {
+ EXPECT_TRUE(obu_->current_frame_->itut_t35_set());
+ const ObuMetadataItutT35& actual = obu_->current_frame_->itut_t35();
+ OBU_TEST_COMPARE(country_code);
+ if (actual.country_code == 0xFF) {
+ OBU_TEST_COMPARE(country_code_extension_byte);
+ }
+ ASSERT_EQ(expected.payload_size, actual.payload_size);
+ if (actual.payload_size != 0) {
+ EXPECT_EQ(memcmp(expected.payload_bytes, actual.payload_bytes,
+ actual.payload_size),
+ 0);
}
}
@@ -2521,9 +2521,9 @@ TEST_F(ObuParserTest, MetadataUnknownType) {
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
}
-TEST_F(ObuParserTest, MetadataCll) {
+TEST_F(ObuParserTest, MetadataHdrCll) {
BytesAndBits data;
- ObuMetadata gold;
+ ObuMetadataHdrCll gold;
gold.max_cll = 25;
gold.max_fall = 100;
@@ -2532,12 +2532,12 @@ TEST_F(ObuParserTest, MetadataCll) {
data.AppendLiteral(16, gold.max_fall);
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
- VerifyMetadata(kMetadataTypeHdrContentLightLevel, gold);
+ VerifyMetadataHdrCll(gold);
}
-TEST_F(ObuParserTest, MetadataMdcv) {
+TEST_F(ObuParserTest, MetadataHdrMdcv) {
BytesAndBits data;
- ObuMetadata gold;
+ ObuMetadataHdrMdcv gold;
for (int i = 0; i < 3; ++i) {
gold.primary_chromaticity_x[i] = 0;
gold.primary_chromaticity_y[i] = 0;
@@ -2558,34 +2558,32 @@ TEST_F(ObuParserTest, MetadataMdcv) {
data.AppendLiteral(32, gold.luminance_min);
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
- VerifyMetadata(kMetadataTypeHdrMasteringDisplayColorVolume, gold);
+ VerifyMetadataHdrMdcv(gold);
}
TEST_F(ObuParserTest, MetadataScalability) {
BytesAndBits data;
- ObuMetadata gold;
data.AppendLiteral(8, kMetadataTypeScalability);
data.AppendLiteral(8, 0); // scalability_mode_idc
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
- VerifyMetadata(kMetadataTypeScalability, gold);
}
TEST_F(ObuParserTest, MetadataItutT35) {
BytesAndBits data;
- ObuMetadata gold;
- gold.itu_t_t35_country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland
- gold.itu_t_t35_country_code_extension_byte = 0;
- gold.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[10]);
- ASSERT_NE(gold.itu_t_t35_payload_bytes, nullptr);
+ ObuMetadataItutT35 gold;
+ gold.country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland
+ DynamicBuffer<uint8_t> payload_bytes;
+ ASSERT_TRUE(payload_bytes.Resize(10));
+ gold.payload_bytes = payload_bytes.get();
for (int i = 0; i < 10; ++i) {
- gold.itu_t_t35_payload_bytes[i] = 9 - i;
+ gold.payload_bytes[i] = 9 - i;
}
- gold.itu_t_t35_payload_size = 10;
+ gold.payload_size = 10;
data.AppendLiteral(8, kMetadataTypeItutT35);
- data.AppendLiteral(8, gold.itu_t_t35_country_code);
+ data.AppendLiteral(8, gold.country_code);
for (int i = 0; i < 10; ++i) {
data.AppendLiteral(8, 9 - i);
}
@@ -2596,12 +2594,20 @@ TEST_F(ObuParserTest, MetadataItutT35) {
data.AppendLiteral(8, 0x00);
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
- VerifyMetadata(kMetadataTypeItutT35, gold);
+ VerifyMetadataItutT35(gold);
+
+ gold.country_code = 0xFF;
+ gold.country_code_extension_byte = 10;
+
+ data.SetLiteral(8, 8, gold.country_code);
+ data.InsertLiteral(16, 8, gold.country_code_extension_byte);
+
+ ASSERT_TRUE(ParseMetadata(data.GenerateData()));
+ VerifyMetadataItutT35(gold);
}
TEST_F(ObuParserTest, MetadataTimecode) {
BytesAndBits data;
- ObuMetadata gold;
data.AppendLiteral(8, kMetadataTypeTimecode);
data.AppendLiteral(5, 0); // counting_type
@@ -2615,12 +2621,10 @@ TEST_F(ObuParserTest, MetadataTimecode) {
data.AppendLiteral(5, 0); // time_offset_length
ASSERT_TRUE(ParseMetadata(data.GenerateData()));
- VerifyMetadata(kMetadataTypeTimecode, gold);
}
TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) {
BytesAndBits data;
- ObuMetadata gold;
data.AppendLiteral(8, kMetadataTypeTimecode);
data.AppendLiteral(5, 0); // counting_type
@@ -2638,7 +2642,6 @@ TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) {
TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) {
BytesAndBits data;
- ObuMetadata gold;
data.AppendLiteral(8, kMetadataTypeTimecode);
data.AppendLiteral(5, 0); // counting_type
@@ -2656,7 +2659,6 @@ TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) {
TEST_F(ObuParserTest, MetadataTimecodeInvalidHoursValue) {
BytesAndBits data;
- ObuMetadata gold;
data.AppendLiteral(8, kMetadataTypeTimecode);
data.AppendLiteral(5, 0); // counting_type
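The T.35 test above now stages its payload through DynamicBuffer (src/utils/dynamic_buffer.h) instead of a bare new[], so allocation failure surfaces as a false return for ASSERT_TRUE. A minimal stand-in consistent with the Resize()/get() calls used in the test; this is a sketch of the shape, not the library's implementation:

#include <cstddef>
#include <memory>
#include <new>

// Stand-in matching the usage above: Resize() reports allocation failure
// as false instead of throwing, get() exposes the raw pointer.
template <typename T>
class DynamicBufferSketch {
 public:
  bool Resize(size_t size) {
    buffer_.reset(new (std::nothrow) T[size]);
    return buffer_ != nullptr;
  }
  T* get() const { return buffer_.get(); }

 private:
  std::unique_ptr<T[]> buffer_;
};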
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
index 48ad823..daee01c 100644
--- a/src/post_filter/deblock.cc
+++ b/src/post_filter/deblock.cc
@@ -329,7 +329,6 @@ void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
HevThresh(level));
}
- // TODO(chengchen): use shifts instead of multiplication.
src_row += row_step * src_stride;
row_step = DivideBy4(row_step);
}
diff --git a/src/post_filter_test.cc b/src/post_filter_test.cc
index db9d0f4..034d31f 100644
--- a/src/post_filter_test.cc
+++ b/src/post_filter_test.cc
@@ -141,6 +141,45 @@ const char* GetSuperResDigest10bpp(int id, int plane) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetSuperResDigest12bpp(int id, int plane) {
+ // Digests are in Y/U/V order.
+ static const char* const kDigestSuperRes[][kMaxPlanes] = {
+ {
+ // all input is 0.
+ "fccb1f57b252b1a86d335aea929d1d58",
+ "2f244a56091c9705794e92e6bcc38058",
+ "2f244a56091c9705794e92e6bcc38058",
+ },
+ {
+ // all input is 1.
+ "de8556204999d6e4bf74cfdde61a095b",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ "e7d0f4ce6df81c46de95da7790a67384",
+ },
+ {
+ // all input is 2048.
+ "83d600a7b3dc9bc3f710668ee2244e6b",
+ "468eec1453edc1befeb8a346f61950a7",
+ "468eec1453edc1befeb8a346f61950a7",
+ },
+ {
+ // all input is 4095.
+ "30bdb1dfee2b02b12b38e6b9f6287e27",
+ "34d673f075d2caa93a2f648ee3569e20",
+ "34d673f075d2caa93a2f648ee3569e20",
+ },
+ {
+ // random input.
+ "f10f21f5322231d991550fce7ef9787d",
+ "a2d8b6140bd5002e86644ef433b8eb42",
+ "a2d8b6140bd5002e86644ef433b8eb42",
+ },
+ };
+ return kDigestSuperRes[id][plane];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
// This type is used to parameterize the tests so is defined outside the
@@ -175,6 +214,7 @@ static std::ostream& operator<<(std::ostream& os, const FrameSizeParam& param) {
template <int bitdepth, typename Pixel>
class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
PostFilterTestBase() = default;
PostFilterTestBase(const PostFilterTestBase&) = delete;
PostFilterTestBase& operator=(const PostFilterTestBase&) = delete;
@@ -231,6 +271,7 @@ class PostFilterTestBase : public testing::TestWithParam<FrameSizeParam> {
template <int bitdepth, typename Pixel>
class PostFilterHelperFuncTest : public PostFilterTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
PostFilterHelperFuncTest() = default;
PostFilterHelperFuncTest(const PostFilterHelperFuncTest&) = delete;
PostFilterHelperFuncTest& operator=(const PostFilterHelperFuncTest&) = delete;
@@ -425,6 +466,7 @@ void PostFilterHelperFuncTest<bitdepth, Pixel>::TestExtendFrame(
template <int bitdepth, typename Pixel>
class PostFilterSuperResTest : public PostFilterTestBase<bitdepth, Pixel> {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
PostFilterSuperResTest() {
test_utils::ResetDspTable(bitdepth);
dsp::SuperResInit_C();
@@ -581,6 +623,11 @@ void PostFilterSuperResTest<bitdepth, Pixel>::TestApplySuperRes(
expected_digest = GetSuperResDigest10bpp(id, plane);
break;
#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetSuperResDigest12bpp(id, plane);
+ break;
+#endif
}
ASSERT_NE(expected_digest, nullptr);
EXPECT_STREQ(digest.c_str(), expected_digest);
@@ -680,6 +727,44 @@ INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
testing::ValuesIn(kTestParamExtendFrame));
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterSuperResTest12bpp = PostFilterSuperResTest<12, uint16_t>;
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperRes) {
+ TestApplySuperRes(true, 0, 0, false);
+ TestApplySuperRes(true, 1, 1, false);
+ TestApplySuperRes(true, 1 << 11, 2, false);
+ TestApplySuperRes(true, (1 << 12) - 1, 3, false);
+ TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest12bpp, ApplySuperResThreaded) {
+ TestApplySuperRes(true, 0, 0, true);
+ TestApplySuperRes(true, 1, 1, true);
+ TestApplySuperRes(true, 1 << 11, 2, true);
+ TestApplySuperRes(true, (1 << 12) - 1, 3, true);
+ TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+ PostFilterSuperResTest12bpp,
+ testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest12bpp = PostFilterHelperFuncTest<12, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest12bpp, ExtendFrame) {
+ TestExtendFrame(true, 0);
+ TestExtendFrame(true, 1);
+ TestExtendFrame(true, 255);
+ TestExtendFrame(true, (1 << 12) - 1);
+ TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+ PostFilterHelperFuncTest12bpp,
+ testing::ValuesIn(kTestParamExtendFrame));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
namespace {
const char* GetDigestApplyCdef8bpp(int id) {
@@ -712,12 +797,29 @@ const char* GetDigestApplyCdef10bpp(int id) {
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+const char* GetDigestApplyCdef12bpp(int id) {
+ static const char* const kDigest[] = {
+ "06e2d09b6ce3924f3b5d4c00ab76eea5", "287240e4b13cb75e17932a3dd7ba3b3c",
+ "265da123e3347c4fb3e434f26a3949e7", "e032ce6eb76242df6894482ac6688406",
+ "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+ "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+ "f648328221f0f02a5b7fc3d55a66271a", "8f759aa84a110902025dacf8062d2f6a",
+ "592b49e4b993d6b4634d8eb1ee3bba54", "29a3e8e329ec70d06910e982ea763e6b",
+ "155dd4283f8037f86cce34b6cfe67a7e", "0a022c70ead199517af9bad2002d70cd",
+ "a966dfea52a7a2084545f68b2c9e1735", "e098438a23a7c9f276e594b98b2db922",
+ };
+ return kDigest[id];
+}
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace
template <int bitdepth, typename Pixel>
class PostFilterApplyCdefTest : public testing::TestWithParam<FrameSizeParam>,
public test_utils::MaxAlignedAllocable {
public:
+ static_assert(bitdepth >= kBitdepth8 && bitdepth <= LIBGAV1_MAX_BITDEPTH, "");
PostFilterApplyCdefTest() = default;
PostFilterApplyCdefTest(const PostFilterApplyCdefTest&) = delete;
PostFilterApplyCdefTest& operator=(const PostFilterApplyCdefTest&) = delete;
@@ -903,17 +1005,25 @@ void PostFilterApplyCdefTest<bitdepth, Pixel>::TestMultiThread(
elapsed_time += absl::Now() - start;
CopyFilterOutputToDestBuffer();
- if (bitdepth == 8) {
- test_utils::CheckMd5Digest(kCdef, kApplyCdefName,
- GetDigestApplyCdef8bpp(id), dest_, size_,
- elapsed_time);
+ const char* expected_digest = nullptr;
+ switch (bitdepth) {
+ case 8:
+ expected_digest = GetDigestApplyCdef8bpp(id);
+ break;
#if LIBGAV1_MAX_BITDEPTH >= 10
- } else {
- test_utils::CheckMd5Digest(kCdef, kApplyCdefName,
- GetDigestApplyCdef10bpp(id), dest_, size_,
- elapsed_time);
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ case 10:
+ expected_digest = GetDigestApplyCdef10bpp(id);
+ break;
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+ case 12:
+ expected_digest = GetDigestApplyCdef12bpp(id);
+ break;
+#endif
}
+ ASSERT_NE(expected_digest, nullptr);
+ test_utils::CheckMd5Digest(kCdef, kApplyCdefName, expected_digest, dest_,
+ size_, elapsed_time);
}
const FrameSizeParam kTestParamApplyCdef[] = {
@@ -953,4 +1063,18 @@ INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
testing::ValuesIn(kTestParamApplyCdef));
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+using PostFilterApplyCdefTest12bpp = PostFilterApplyCdefTest<12, uint16_t>;
+
+TEST_P(PostFilterApplyCdefTest12bpp, ApplyCdef) {
+ TestMultiThread(2);
+ TestMultiThread(4);
+ TestMultiThread(8);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance,
+ PostFilterApplyCdefTest12bpp,
+ testing::ValuesIn(kTestParamApplyCdef));
+#endif // LIBGAV1_MAX_BITDEPTH == 12
+
} // namespace libgav1
diff --git a/src/quantizer.cc b/src/quantizer.cc
index cd720d6..eb13314 100644
--- a/src/quantizer.cc
+++ b/src/quantizer.cc
@@ -20,8 +20,9 @@
#include "src/utils/common.h"
#include "src/utils/constants.h"
-#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
-#error LIBGAV1_MAX_BITDEPTH must be 8 or 10
+#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 && \
+ LIBGAV1_MAX_BITDEPTH != 12
+#error LIBGAV1_MAX_BITDEPTH must be 8, 10 or 12
#endif
namespace libgav1 {
@@ -87,6 +88,43 @@ constexpr int16_t kDcLookup[][256] = {
4737, 4929, 5130, 5347
},
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Lookup table for 12 bit.
+ {
+ 4, 12, 18, 25, 33, 41, 50, 60,
+ 70, 80, 91, 103, 115, 127, 140, 153,
+ 166, 180, 194, 208, 222, 237, 251, 266,
+ 281, 296, 312, 327, 343, 358, 374, 390,
+ 405, 421, 437, 453, 469, 484, 500, 516,
+ 532, 548, 564, 580, 596, 611, 627, 643,
+ 659, 674, 690, 706, 721, 737, 752, 768,
+ 783, 798, 814, 829, 844, 859, 874, 889,
+ 904, 919, 934, 949, 964, 978, 993, 1008,
+ 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122,
+ 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342,
+ 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544,
+ 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741,
+ 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933,
+ 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199,
+ 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467,
+ 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788,
+ 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127,
+ 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517,
+ 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951,
+ 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942,
+ 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517,
+ 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149,
+ 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867,
+ 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715,
+ 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788,
+ 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245,
+ 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409,
+ 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812,
+ 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
};
constexpr int16_t kAcLookup[][256] = {
@@ -142,6 +180,43 @@ constexpr int16_t kAcLookup[][256] = {
6900, 7036, 7172, 7312
},
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Lookup table for 12 bit.
+ {
+ 4, 13, 19, 27, 35, 44, 54, 64,
+ 75, 87, 99, 112, 126, 139, 154, 168,
+ 183, 199, 214, 230, 247, 263, 280, 297,
+ 314, 331, 349, 366, 384, 402, 420, 438,
+ 456, 475, 493, 511, 530, 548, 567, 586,
+ 604, 623, 642, 660, 679, 698, 716, 735,
+ 753, 772, 791, 809, 828, 846, 865, 884,
+ 902, 920, 939, 957, 976, 994, 1012, 1030,
+ 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175,
+ 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317,
+ 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595,
+ 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856,
+ 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118,
+ 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378,
+ 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750,
+ 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137,
+ 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619,
+ 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149,
+ 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791,
+ 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544,
+ 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435,
+ 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635,
+ 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028,
+ 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661,
+ 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565,
+ 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806,
+ 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414,
+ 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486,
+ 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070,
+ 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
};
// clang-format on
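Editor's note: the lookup these tables back is simple. The effective index is the qindex plus a per-plane delta, clipped to [0, 255], and the bitdepth selects the row ((bitdepth - 8) / 2, so the new 12-bit rows are index 2). A minimal sketch of that clipping, assuming a single 256-entry row; GetDcValue12bpp is illustrative, not the libgav1 API:

#include <algorithm>
#include <cstdint>

// Illustrative stand-in for one 256-entry row such as the 12-bit kDcLookup
// row added above.
int16_t GetDcValue12bpp(const int16_t (&dc_lookup_12bit)[256], int qindex,
                        int delta) {
  // Clipping explains the test expectations below: any qindex + delta at or
  // below 0 yields entry 0 (value 4), and anything at or above 255 yields
  // entry 255 (value 21387).
  const int index = std::min(std::max(qindex + delta, 0), 255);
  return dc_lookup_12bit[index];
}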
diff --git a/src/quantizer_test.cc b/src/quantizer_test.cc
index 618d247..0c27027 100644
--- a/src/quantizer_test.cc
+++ b/src/quantizer_test.cc
@@ -106,6 +106,32 @@ TEST(QuantizerTest, GetDcValue) {
EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 5347);
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Test lookups of Dc_Qlookup[2][0], Dc_Qlookup[2][11], Dc_Qlookup[2][12],
+ // and Dc_Qlookup[2][255] in the spec, including the clipping of qindex.
+ {
+ Quantizer quantizer(12, &params);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 115);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 115);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 103);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 115);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 21387);
+ EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 21387);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
}
TEST(QuantizerTest, GetAcValue) {
@@ -162,6 +188,32 @@ TEST(QuantizerTest, GetAcValue) {
EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 7312);
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_MAX_BITDEPTH == 12
+ // Test lookups of Ac_Qlookup[2][0], Ac_Qlookup[2][11], Ac_Qlookup[2][12],
+ // and Ac_Qlookup[2][255] in the spec, including the clipping of qindex.
+ {
+ Quantizer quantizer(12, &params);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 112);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 126);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 29247);
+ EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 29247);
+ }
+#endif // LIBGAV1_MAX_BITDEPTH == 12
}
} // namespace
diff --git a/src/threading_strategy_test.cc b/src/threading_strategy_test.cc
index 2a7a781..beea36f 100644
--- a/src/threading_strategy_test.cc
+++ b/src/threading_strategy_test.cc
@@ -99,7 +99,14 @@ TEST_F(ThreadingStrategyTest, MultipleCalls) {
ASSERT_TRUE(strategy_.Reset(frame_header_, 16));
EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
for (int i = 0; i < 8; ++i) {
- EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i >= 4) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
}
EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
@@ -120,11 +127,18 @@ TEST_F(ThreadingStrategyTest, MultipleCalls) {
EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
// First two tiles will get 1 thread each.
for (int i = 0; i < 2; ++i) {
- EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i == 1) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
+ EXPECT_NE(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
}
// All the other row threads must be reset.
for (int i = 2; i < 8; ++i) {
- EXPECT_EQ(strategy_.row_thread_pool(i), nullptr);
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
}
EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr);
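Editor's note: the repeated #if defined(__ANDROID__) guards encode the same expectation three times: on Android, ThreadingStrategy::Reset allocates fewer row thread pools, so the tests expect nullptr past the platform's cap. A condensed sketch of the pattern; the helper and the cap of 4 are assumptions mirroring the first loop, not ThreadingStrategy internals:

#include "gtest/gtest.h"

// Illustrative helper (not part of the test file): checks row thread pools
// [0, count), tolerating the smaller Android allocation.
template <typename Strategy>
void ExpectRowPools(const Strategy& strategy, int count) {
  for (int i = 0; i < count; ++i) {
#if defined(__ANDROID__)
    if (i >= 4) {  // Cap of 4 mirrors the first loop above; see Reset().
      EXPECT_EQ(strategy.row_thread_pool(i), nullptr) << "i = " << i;
      continue;
    }
#endif
    EXPECT_NE(strategy.row_thread_pool(i), nullptr) << "i = " << i;
  }
}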
@@ -153,6 +167,13 @@ TEST_F(ThreadingStrategyTest, MultipleCalls2) {
ASSERT_TRUE(strategy_.Reset(frame_header_, 4));
EXPECT_NE(strategy_.tile_thread_pool(), nullptr);
for (int i = 0; i < 2; ++i) {
+ // See ThreadingStrategy::Reset().
+#if defined(__ANDROID__)
+ if (i == 1) {
+ EXPECT_EQ(strategy_.row_thread_pool(i), nullptr) << "i = " << i;
+ continue;
+ }
+#endif
EXPECT_NE(strategy_.row_thread_pool(i), nullptr);
}
for (int i = 2; i < 8; ++i) {
diff --git a/src/tile.h b/src/tile.h
index 83c3423..fcab963 100644
--- a/src/tile.h
+++ b/src/tile.h
@@ -464,13 +464,14 @@ class Tile : public MaxAlignedAllocable {
int* start_y, int* step_x, int* step_y); // 7.11.3.3.
// If the method returns false, the caller only uses the output parameters
// *ref_block_start_x and *ref_block_start_y. If the method returns true, the
- // caller uses all three output parameters.
+ // caller uses all four output parameters.
static bool GetReferenceBlockPosition(
int reference_frame_index, bool is_scaled, int width, int height,
int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
int start_x, int start_y, int step_x, int step_y, int left_border,
int right_border, int top_border, int bottom_border,
- int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+ int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x,
+ int* ref_block_end_y);
template <typename Pixel>
void BuildConvolveBlock(Plane plane, int reference_frame_index,
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
index bba5a69..4348548 100644
--- a/src/tile/prediction.cc
+++ b/src/tile/prediction.cc
@@ -771,11 +771,10 @@ bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
[static_cast<int>(prediction_parameters.mask_is_inverse)](
block.scratch_buffer->prediction_buffer[0],
block.scratch_buffer->prediction_buffer[1],
- block.scratch_buffer->weight_mask,
- kMaxSuperBlockSizeInPixels);
+ block.scratch_buffer->weight_mask, block.width);
}
prediction_mask = block.scratch_buffer->weight_mask;
- prediction_mask_stride = kMaxSuperBlockSizeInPixels;
+ prediction_mask_stride = block.width;
}
if (is_compound) {
@@ -996,7 +995,7 @@ bool Tile::GetReferenceBlockPosition(
const int start_y, const int step_x, const int step_y,
const int left_border, const int right_border, const int top_border,
const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
- int* ref_block_end_x) {
+ int* ref_block_end_x, int* ref_block_end_y) {
*ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
*ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
if (reference_frame_index == -1) {
@@ -1006,7 +1005,7 @@ bool Tile::GetReferenceBlockPosition(
*ref_block_start_y -= kConvolveBorderLeftTop;
*ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
kConvolveBorderRight;
- int ref_block_end_y =
+ *ref_block_end_y =
GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
kConvolveBorderBottom;
if (is_scaled) {
@@ -1015,13 +1014,13 @@ bool Tile::GetReferenceBlockPosition(
kScaleSubPixelBits) +
kSubPixelTaps;
*ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight;
- ref_block_end_y = *ref_block_start_y + block_height - 1;
+ *ref_block_end_y = *ref_block_start_y + block_height - 1;
}
// Determines if we need to extend beyond the left/right/top/bottom border.
return *ref_block_start_x < (ref_start_x - left_border) ||
*ref_block_end_x > (ref_last_x + right_border) ||
*ref_block_start_y < (ref_start_y - top_border) ||
- ref_block_end_y > (ref_last_y + bottom_border);
+ *ref_block_end_y > (ref_last_y + bottom_border);
}
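Editor's note: the return value is the extension predicate, so the block is rebuilt with extended pixels whenever the inclusive reference rectangle reaches outside the frame grown by its borders. A standalone numeric sketch of the x-axis half of the check; all values are illustrative:

#include <iostream>

int main() {
  // A 10-pixel-wide reference row [0, 9] with 3 border pixels on each side.
  const int ref_start_x = 0, ref_last_x = 9;
  const int left_border = 3, right_border = 3;
  // Block start lies inside the left border, so it alone needs no extension;
  // the block end overshoots the right border, so extension is required.
  const int ref_block_start_x = -2;
  const int ref_block_end_x = 13;
  const bool extend_block =
      ref_block_start_x < (ref_start_x - left_border) ||
      ref_block_end_x > (ref_last_x + right_border);
  std::cout << "extend_block = " << extend_block << "\n";  // prints 1
  return 0;
}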
// Builds a block as the input for convolve, by copying the content of
@@ -1140,6 +1139,7 @@ bool Tile::BlockInterPrediction(
int ref_block_start_x;
int ref_block_start_y;
int ref_block_end_x;
+ int ref_block_end_y;
const bool extend_block = GetReferenceBlockPosition(
reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
@@ -1147,24 +1147,15 @@ bool Tile::BlockInterPrediction(
reference_buffer->right_border(plane),
reference_buffer->top_border(plane),
reference_buffer->bottom_border(plane), &ref_block_start_x,
- &ref_block_start_y, &ref_block_end_x);
+ &ref_block_start_y, &ref_block_end_x, &ref_block_end_y);
// In frame parallel mode, ensure that the reference block has been decoded
// and available for referencing.
if (reference_frame_index != -1 && frame_parallel_) {
- int reference_y_max;
- if (is_scaled) {
- // TODO(vigneshv): For now, we wait for the entire reference frame to be
- // decoded if we are using scaled references. This will eventually be
- // fixed.
- reference_y_max = reference_height;
- } else {
- reference_y_max =
- std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
- // For U and V planes with subsampling, we need to multiply
- // reference_y_max by 2 since we only track the progress of Y planes.
- reference_y_max = LeftShift(reference_y_max, subsampling_y);
- }
+ // For U and V planes with subsampling, we need to multiply the value of
+ // reference_y_max by 2 since we only track the progress of the Y planes.
+ const int reference_y_max = LeftShift(
+ std::min(ref_block_end_y + kSubPixelTaps, ref_last_y), subsampling_y);
if (reference_frame_progress_cache_[reference_frame_index] <
reference_y_max &&
!reference_frames_[reference_frame_index]->WaitUntil(
@@ -1297,11 +1288,12 @@ bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
start_x += 8) {
const int src_x = (start_x + 4) << subsampling_x_[plane];
const int src_y = (start_y + 4) << subsampling_y_[plane];
- const int dst_y = src_x * warp_params->params[4] +
- src_y * warp_params->params[5] +
- warp_params->params[1];
- const int y4 = dst_y >> subsampling_y_[plane];
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const int64_t dst_y =
+ src_x * warp_params->params[4] +
+ static_cast<int64_t>(src_y) * warp_params->params[5] +
+ warp_params->params[1];
+ const int64_t y4 = dst_y >> subsampling_y_[plane];
+ const int iy4 = static_cast<int>(y4 >> kWarpedModelPrecisionBits);
reference_y_max = std::max(iy4 + 8, reference_y_max);
}
}
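Editor's note: the widening to int64_t above matters because warp parameters carry kWarpedModelPrecisionBits (16) of fractional precision, so the product of a large pixel coordinate and a parameter can exceed INT32_MAX before the final shifts. A standalone sketch with illustrative magnitudes:

#include <cstdint>
#include <iostream>

int main() {
  const int64_t src_y = 20000;    // a plausible upscaled row coordinate
  const int64_t param = 2 << 16;  // a warp parameter at 1/65536 precision
  const int64_t product = src_y * param;  // 2,621,440,000
  std::cout << "product = " << product
            << " overflows int32: " << (product > INT32_MAX) << "\n";
  return 0;
}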
diff --git a/src/utils/constants.h b/src/utils/constants.h
index 1126ad6..8281aad 100644
--- a/src/utils/constants.h
+++ b/src/utils/constants.h
@@ -37,6 +37,10 @@ enum {
}; // anonymous enum
enum {
+ // Documentation variables.
+ kBitdepth8 = 8,
+ kBitdepth10 = 10,
+ kBitdepth12 = 12,
kInvalidMvValue = -32768,
kCdfMaxProbability = 32768,
kBlockWidthCount = 5,
@@ -59,6 +63,13 @@ enum {
kRestorationTypeSymbolCount = 3,
kSgrProjParamsBits = 4,
kSgrProjPrecisionBits = 7,
+ // Precision of a division table (mtable)
+ kSgrProjScaleBits = 20,
+ kSgrProjReciprocalBits = 12,
+ // Core self-guided restoration precision bits.
+ kSgrProjSgrBits = 8,
+ // Precision bits of generated values higher than source before projection.
+ kSgrProjRestoreBits = 4,
// Padding on left and right side of a restoration block.
// 3 is enough, but padding to 4 is more efficient, and makes the temporary
// source buffer 8-pixel aligned.
@@ -177,6 +188,15 @@ enum {
// On Linux, the cache line size can be looked up with the command:
// getconf LEVEL1_DCACHE_LINESIZE
kCacheLineSize = 64,
+ // InterRound0, Section 7.11.3.2.
+ kInterRoundBitsHorizontal = 3, // 8 & 10-bit.
+ kInterRoundBitsHorizontal12bpp = 5,
+ kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction.
+ kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction.
+ kInterRoundBitsVertical12bpp = 9,
+ // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+ // uint16_t. Removed before blending.
+ kCompoundOffset = (1 << 14) + (1 << 13),
}; // anonymous enum
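Editor's note: kCompoundOffset, (1 << 14) + (1 << 13) = 24576, exists because intermediate compound predictors at 10bpp/12bpp can be negative after the rounding shifts; biasing them keeps the values non-negative so they fit in uint16_t buffers. A minimal sketch of the store/load pairing; the helper names are illustrative:

#include <cstdint>

constexpr int kOffsetSketch = (1 << 14) + (1 << 13);  // mirrors kCompoundOffset

// Bias a possibly negative intermediate predictor into uint16_t range.
inline uint16_t StoreCompound(int intermediate) {
  return static_cast<uint16_t>(intermediate + kOffsetSketch);
}

// Remove the bias again before blending, recovering the signed value.
inline int LoadCompound(uint16_t stored) {
  return static_cast<int>(stored) - kOffsetSketch;
}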
enum FrameType : uint8_t {
diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc
index 4284ca2..bbf40c3 100644
--- a/src/utils/segmentation_map.cc
+++ b/src/utils/segmentation_map.cc
@@ -21,9 +21,12 @@
namespace libgav1 {
bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) {
+ if (rows4x4 * columns4x4 > rows4x4_ * columns4x4_) {
+ segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4 * columns4x4]);
+ }
+
rows4x4_ = rows4x4;
columns4x4_ = columns4x4;
- segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]);
if (segment_id_buffer_ == nullptr) return false;
segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get());
return true;
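Editor's note: the Allocate change above avoids reallocating when the map shrinks or stays the same size; the buffer is replaced only when the requested area exceeds the previously allocated one. The same pattern in isolation (GrowOnlyMap is a generic sketch, not the SegmentationMap API):

#include <cstdint>
#include <memory>
#include <new>

class GrowOnlyMap {
 public:
  bool Resize(int32_t rows, int32_t columns) {
    // Reallocate only when the existing buffer is too small.
    if (rows * columns > rows_ * columns_) {
      buffer_.reset(new (std::nothrow) int8_t[rows * columns]);
    }
    rows_ = rows;
    columns_ = columns;
    return buffer_ != nullptr;
  }

 private:
  int32_t rows_ = 0;
  int32_t columns_ = 0;
  std::unique_ptr<int8_t[]> buffer_;
};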
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
index 69b40e8..0da8a1f 100644
--- a/src/warp_prediction.cc
+++ b/src/warp_prediction.cc
@@ -231,9 +231,6 @@ bool WarpEstimation(const int num_samples, const int block_width4x4,
Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
params[1] =
Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
-
- params[6] = 0;
- params[7] = 0;
return true;
}
diff --git a/tests/block_utils.cc b/tests/block_utils.cc
index 07337c4..a68ae64 100644
--- a/tests/block_utils.cc
+++ b/tests/block_utils.cc
@@ -55,7 +55,6 @@ void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width,
block2 += stride2;
}
}
-#undef LIBGAV1_DEBUG_FORMAT_CODE
} // namespace
@@ -68,15 +67,16 @@ void PrintBlock(const Pixel* block, int width, int height, int stride,
printf("[%2d] ", y);
for (int x = 0; x < print_width; ++x) {
if (x >= width) {
- printf("[%*d] ", field_width, block[x]);
+ printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block[x]);
} else {
- printf("%*d ", field_width, block[x]);
+ printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block[x]);
}
}
printf("\n");
block += stride;
}
}
+#undef LIBGAV1_DEBUG_FORMAT_CODE
template void PrintBlock(const uint8_t* block, int width, int height,
int stride, bool print_padding /*= false*/);
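Editor's note: the #undef had to move because LIBGAV1_DEBUG_FORMAT_CODE is now spliced into the printf format strings inside PrintBlock itself, relying on compile-time string-literal concatenation. A standalone sketch with an assumed format code of "d":

#include <cstdio>

#define DEMO_FORMAT_CODE "d"  // stand-in for LIBGAV1_DEBUG_FORMAT_CODE

int main() {
  // "%*" DEMO_FORMAT_CODE " " concatenates to "%*d " at compile time; the
  // '*' consumes field_width so padded and bracketed cells stay aligned.
  const int field_width = 4;
  const int pixel = 255;
  std::printf("[%*" DEMO_FORMAT_CODE "] ", field_width, pixel);  // "[ 255] "
  std::printf("%*" DEMO_FORMAT_CODE " \n", field_width, pixel);  // " 255 "
  return 0;
}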
diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake
index 2b3f41c..c759d4f 100644
--- a/tests/libgav1_tests.cmake
+++ b/tests/libgav1_tests.cmake
@@ -96,9 +96,13 @@ list(APPEND libgav1_common_sse4_test_sources
list(APPEND libgav1_convolve_test_sources
"${libgav1_source}/dsp/convolve_test.cc")
list(APPEND libgav1_cpu_test_sources "${libgav1_source}/utils/cpu_test.cc")
-list(APPEND libgav1_c_decoder_test_sources "${libgav1_source}/c_decoder_test.c")
+list(APPEND libgav1_c_decoder_test_sources
+ "${libgav1_source}/c_decoder_test.c"
+ "${libgav1_source}/decoder_test_data.h")
list(APPEND libgav1_c_version_test_sources "${libgav1_source}/c_version_test.c")
-list(APPEND libgav1_decoder_test_sources "${libgav1_source}/decoder_test.cc")
+list(APPEND libgav1_decoder_test_sources
+ "${libgav1_source}/decoder_test.cc"
+ "${libgav1_source}/decoder_test_data.h")
list(APPEND libgav1_decoder_buffer_test_sources
"${libgav1_source}/decoder_buffer_test.cc")
list(APPEND libgav1_distance_weighted_blend_test_sources
@@ -217,18 +221,6 @@ macro(libgav1_add_tests_targets)
${libgav1_gtest_include_paths}
${libgav1_include_paths})
- if(ANDROID OR IOS)
- if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX
- AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX)
- set(use_absl_threading TRUE)
- endif()
- elseif(NOT
- (DEFINED
- LIBGAV1_THREADPOOL_USE_STD_MUTEX
- AND LIBGAV1_THREADPOOL_USE_STD_MUTEX))
- set(use_absl_threading TRUE)
- endif()
-
if(use_absl_threading)
list(APPEND libgav1_common_test_absl_deps absl::synchronization)
endif()
diff --git a/tests/utils.h b/tests/utils.h
index 4d73070..3394d64 100644
--- a/tests/utils.h
+++ b/tests/utils.h
@@ -25,6 +25,7 @@
#include "absl/strings/string_view.h"
#include "absl/time/time.h"
#include "src/gav1/decoder_buffer.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/memory.h"
#include "tests/third_party/libvpx/acm_random.h"
@@ -42,9 +43,22 @@ static_assert(kAlternateDeterministicSeed !=
// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions
// of new to support googletest allocations.
+// Note when building the source as C++17 or greater, gcc 11.2.0 may issue a
+// warning of the form:
+// warning: 'void operator delete [](void*, std::align_val_t)' called on
+// pointer returned from a mismatched allocation function
+// note: returned from 'static void*
+// libgav1::test_utils::MaxAlignedAllocable::operator new [](size_t)'
+// This is a false positive as this function calls
+// libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow) which in
+// turn calls
+// void* operator new[](std::size_t, std::align_val_t, const std::nothrow_t&).
+// This is due to unbalanced inlining of the functions, so we force them to be
+// inlined.
+// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103993
struct MaxAlignedAllocable {
// Class-specific allocation functions.
- static void* operator new(size_t size) {
+ static LIBGAV1_ALWAYS_INLINE void* operator new(size_t size) {
void* const p =
libgav1::MaxAlignedAllocable::operator new(size, std::nothrow);
#ifdef ABSL_HAVE_EXCEPTIONS
@@ -52,7 +66,7 @@ struct MaxAlignedAllocable {
#endif
return p;
}
- static void* operator new[](size_t size) {
+ static LIBGAV1_ALWAYS_INLINE void* operator new[](size_t size) {
void* const p =
libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow);
#ifdef ABSL_HAVE_EXCEPTIONS
@@ -62,29 +76,33 @@ struct MaxAlignedAllocable {
}
// Class-specific non-throwing allocation functions
- static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void* operator new(
+ size_t size, const std::nothrow_t& tag) noexcept {
return libgav1::MaxAlignedAllocable::operator new(size, tag);
}
- static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void* operator new[](
+ size_t size, const std::nothrow_t& tag) noexcept {
return libgav1::MaxAlignedAllocable::operator new[](size, tag);
}
// Class-specific deallocation functions.
- static void operator delete(void* ptr) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void operator delete(void* ptr) noexcept {
libgav1::MaxAlignedAllocable::operator delete(ptr);
}
- static void operator delete[](void* ptr) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void operator delete[](void* ptr) noexcept {
libgav1::MaxAlignedAllocable::operator delete[](ptr);
}
// Only called if new (std::nothrow) is used and the constructor throws an
// exception.
- static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void operator delete(
+ void* ptr, const std::nothrow_t& tag) noexcept {
libgav1::MaxAlignedAllocable::operator delete(ptr, tag);
}
// Only called if new[] (std::nothrow) is used and the constructor throws an
// exception.
- static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+ static LIBGAV1_ALWAYS_INLINE void operator delete[](
+ void* ptr, const std::nothrow_t& tag) noexcept {
libgav1::MaxAlignedAllocable::operator delete[](ptr, tag);
}
};
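Editor's note: for context, a usage sketch of the mix-in. A test type inherits the class-specific allocation functions so that both the throwing array form (used by googletest) and the matching deallocation resolve to the max-aligned allocator; the forced inlining above keeps gcc from flagging the pair as mismatched. TestBlock is hypothetical:

#include <cstdint>

#include "tests/utils.h"

namespace {

struct TestBlock : public libgav1::test_utils::MaxAlignedAllocable {
  uint8_t data[64];
};

void Exercise() {
  // new[] resolves to the always-inlined class-specific operator new[],
  // which forwards to libgav1::MaxAlignedAllocable's nothrow allocator.
  TestBlock* const blocks = new TestBlock[8];
  // delete[] must resolve to the matching class-specific deallocator.
  delete[] blocks;
}

}  // namespace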